NeoXArgs.from_ymls() ['../input/pythia-160m.yml'] INFO:root:NeoXArgs.calculate_derived() Total number of GPUs determined to be: 1 -------------------- arguments -------------------- attention_config ................ ['flash', 'flash', 'flash', 'flash', 'flash', 'flash', 'flash', 'flash', 'flash', 'flash', 'flash', 'flash']updated attention_dropout ............... 0...........................updated batch_size ...................... 32..........................updated bias_gelu_fusion ................ True........................updated checkpoint_activations .......... True........................updated checkpoint_factor ............... 1000........................updated clip_grad ....................... 1.0.........................updated config_files .................... {'pythia-160m.yml': '{\n # parallelism settings\n "pipe-parallel-size": 1,\n "model-parallel-size": 1,\n\n # model settings\n "num-layers": 12,\n "hidden-size": 768,\n "num-attention-heads": 12,\n "seq-length": 2048,\n "max-position-embeddings": 2048,\n "pos-emb": "rotary",\n "rotary-pct": 0.25,\n "no-weight-tying": true,\n "gpt-j-residual": true,\n "output-layer-parallelism": "column",\n \n "attention-config": [[["flash"], 12]],\n\n "scaled-upper-triang-masked-softmax-fusion": true,\n "bias-gelu-fusion": true,\n\n # init methods\n "init_method": "small_init",\n "output_layer_init_method": "wang_init",\n\n "optimizer": {\n "type": "Adam",\n "params": {\n "lr": 0.0006,\n "betas": [0.9, 0.95],\n "eps": 1.0e-8\n }\n },\n "min_lr": 0.00006,\n\n "zero_optimization": {\n "stage": 1,\n "allgather_partitions": true,\n "allgather_bucket_size": 500000000,\n "overlap_comm": true,\n "reduce_scatter": true,\n "reduce_bucket_size": 500000000,\n "contiguous_gradients": true,\n "cpu_offload": false\n },\n\n # batch size (trained on 32 gpus)\n "train_micro_batch_size_per_gpu": 32,\n "gradient_accumulation_steps": 32,\n "gas": 1,\n "data-impl": "mmap",\n "num_workers": 1,\n\n # activation checkpointing\n "checkpoint-activations": true,\n "checkpoint-num-layers": 1,\n "partition-activations": true,\n "synchronize-each-layer": true,\n\n # regularization\n "gradient_clipping": 1.0,\n "weight-decay": 0.1,\n "hidden-dropout": 0,\n "attention-dropout": 0,\n\n # precision settings\n "fp16": {\n "fp16": true,\n "enabled": true,\n "loss_scale": 0,\n "loss_scale_window": 1000,\n "initial_scale_power": 12,\n "hysteresis": 2,\n "min_loss_scale": 1\n },\n\n "train-iters": 143000,\n "lr-decay-iters": 143000,\n "distributed-backend": "nccl",\n "lr-decay-style": "cosine",\n "warmup": 0.01,\n "checkpoint-factor": 1000,\n "extra-save-iters": [0,1,2,4,8,16,32,64,128,256,512],\n "eval-interval": 40000,\n "eval-iters": 10,\n\n "log-interval": 10,\n "steps_per_print": 10,\n "wall_clock_breakdown": true,\n\n "train-data-paths": ["../input/pythia_mydata_idxmaps/mydata_left_text_document"],\n "valid-data-paths": ["../input/pythia_mydata_idxmaps/mydata_left_text_document"],\n "test-data-paths": ["../input/pythia_mydata_idxmaps/mydata_left_text_document"],\n\n "tokenizer-type": "HFTokenizer",\n "vocab-file": "../input/20B_tokenizer.json",\n\n "launcher": "slurm",\n "deepspeed_slurm": false,\n\n "save": "../checkpoints/mydata_left-pythia160m",\n "load": "../checkpoints/mydata_left-pythia160m",\n "checkpoint_validation_with_forward_pass": False,\n}\n'}updated data_impl ....................... mmap........................updated dynamic_loss_scale .............. True........................updated eval_interval ................... 40000.......................updated eval_iters ...................... 10..........................updated extra_save_iters ................ [0, 1, 2, 4, 8, 16, 32, 64, 128, 256, 512]updated fp16 ............................ {'fp16': True, 'enabled': True, 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 12, 'hysteresis': 2, 'min_loss_scale': 1}updated gas ............................. 32..........................updated global_num_gpus ................. 1...........................updated gpt_j_residual .................. True........................updated gradient_accumulation_steps ..... 32..........................updated gradient_clipping ............... 1.0.........................updated hidden_dropout .................. 0...........................updated hidden_size ..................... 768.........................updated init_method ..................... small_init..................updated is_pipe_parallel ................ True........................updated launcher ........................ slurm.......................updated load ............................ ../checkpoints/mydata_left-pythia160mupdated log_interval .................... 10..........................updated lr .............................. 0.0006......................updated lr_decay_iters .................. 143000......................updated lr_decay_style .................. cosine......................updated max_position_embeddings ......... 2048........................updated min_lr .......................... 6e-05.......................updated no_weight_tying ................. True........................updated num_attention_heads ............. 12..........................updated num_layers ...................... 12..........................updated num_workers ..................... 1...........................updated optimizer ....................... {'type': 'Adam', 'params': {'lr': 0.0006, 'betas': [0.9, 0.95], 'eps': 1e-08}}updated optimizer_type .................. Adam........................updated output_layer_init_method ........ wang_init...................updated output_layer_parallelism ........ column......................updated partition_activations ........... True........................updated pipe_parallel_size .............. 1...........................updated pos_emb ......................... rotary......................updated precision ....................... fp16........................updated rotary_pct ...................... 0.25........................updated save ............................ ../checkpoints/mydata_left-pythia160mupdated save_iters ...................... [0, 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000, 11000, 12000, 13000, 14000, 15000, 16000, 17000, 18000, 19000, 20000, 21000, 22000, 23000, 24000, 25000, 26000, 27000, 28000, 29000, 30000, 31000, 32000, 33000, 34000, 35000, 36000, 37000, 38000, 39000, 40000, 41000, 42000, 43000, 44000, 45000, 46000, 47000, 48000, 49000, 50000, 51000, 52000, 53000, 54000, 55000, 56000, 57000, 58000, 59000, 60000, 61000, 62000, 63000, 64000, 65000, 66000, 67000, 68000, 69000, 70000, 71000, 72000, 73000, 74000, 75000, 76000, 77000, 78000, 79000, 80000, 81000, 82000, 83000, 84000, 85000, 86000, 87000, 88000, 89000, 90000, 91000, 92000, 93000, 94000, 95000, 96000, 97000, 98000, 99000, 100000, 101000, 102000, 103000, 104000, 105000, 106000, 107000, 108000, 109000, 110000, 111000, 112000, 113000, 114000, 115000, 116000, 117000, 118000, 119000, 120000, 121000, 122000, 123000, 124000, 125000, 126000, 127000, 128000, 129000, 130000, 131000, 132000, 133000, 134000, 135000, 136000, 137000, 138000, 139000, 140000, 141000, 142000]updated scaled_upper_triang_masked_softmax_fusion True...............updated seq_length ...................... 2048........................updated sparsity_config ................. {}..........................updated synchronize_each_layer .......... True........................updated test_data_paths ................. ['../input/pythia_mydata_idxmaps/mydata_left_text_document']updated test_data_weights ............... [1.0].......................updated text_gen_type ................... unconditional...............updated tokenizer_type .................. HFTokenizer.................updated train_batch_size ................ 1024........................updated train_data_paths ................ ['../input/pythia_mydata_idxmaps/mydata_left_text_document']updated train_data_weights .............. [1.0].......................updated train_iters ..................... 143000......................updated train_micro_batch_size_per_gpu .. 32..........................updated user_script ..................... train.py....................updated valid_data_paths ................ ['../input/pythia_mydata_idxmaps/mydata_left_text_document']updated valid_data_weights .............. [1.0].......................updated vocab_file ...................... ../input/20B_tokenizer.json.updated wall_clock_breakdown ............ True........................updated wandb_group ..................... jS3hHmmzm2uWaLRVKtLL8X_go87bv9rupdated weight_decay .................... 0.1.........................updated zero_allgather_bucket_size ...... 500000000...................updated zero_contiguous_gradients ....... True........................updated zero_optimization ............... {'stage': 1, 'allgather_partitions': True, 'allgather_bucket_size': 500000000, 'overlap_comm': True, 'reduce_scatter': True, 'reduce_bucket_size': 500000000, 'contiguous_gradients': True, 'cpu_offload': False}updated zero_reduce_bucket_size ......... 500000000...................updated zero_reduce_scatter ............. True........................updated zero_stage ...................... 1...........................updated activation ...................... gelu........................default adlr_autoresume ................. False.......................default adlr_autoresume_interval ........ 1000........................default amp ............................. None........................default apply_query_key_layer_scaling ... False.......................default attention_softmax_in_fp32 ....... False.......................default bias_dropout_fusion ............. False.......................default char_level_ppl .................. False.......................default checkpoint_in_cpu ............... False.......................default checkpoint_num_layers ........... 1...........................default checkpoint_scale ................ linear......................default checkpoint_validation_with_forward_pass False................default comment ......................... None........................default contiguous_checkpointing ........ False.......................default data_path ....................... None........................default deepscale ....................... False.......................default deepscale_config ................ None........................default deepspeed ....................... True........................default deepspeed_activation_checkpointing True......................default deepspeed_mpi ................... False.......................default deepspeed_slurm ................. False.......................default detect_nvlink_pairs ............. False.......................default distributed_backend ............. nccl........................default do_test ......................... None........................default do_train ........................ None........................default do_valid ........................ None........................default dump_state ...................... False.......................default eod_mask_loss ................... False.......................default eval_results_prefix ............. ............................default eval_tasks ...................... None........................default exclude ......................... None........................default exit_interval ................... None........................default finetune ........................ False.......................default flops_profiler .................. None........................default fp16_lm_cross_entropy ........... False.......................default fp32_allreduce .................. False.......................default git_hash ........................ 71df4d50....................default gmlp_attn_dim ................... 64..........................default gpt_j_tied ...................... False.......................default gradient_noise_scale_cpu_offload False.......................default gradient_noise_scale_n_batches .. 5...........................default gradient_predivide_factor ....... 1.0.........................default hostfile ........................ None........................default hysteresis ...................... 2...........................default include ......................... None........................default init_method_std ................. 0.02........................default iteration ....................... None........................default keep_last_n_checkpoints ......... None........................default layernorm_epsilon ............... 1e-05.......................default lazy_mpu_init ................... False.......................default local_rank ...................... None........................default log_dir ......................... None........................default log_grad_norm ................... False.......................default log_grad_pct_zeros .............. False.......................default log_gradient_noise_scale ........ False.......................default log_optimizer_states ............ False.......................default log_param_norm .................. False.......................default loss_scale ...................... None........................default loss_scale_window ............... 1000.0......................default make_vocab_size_divisible_by .... 128.........................default master_addr ..................... None........................default master_port ..................... 29500.......................default maximum_tokens .................. 64..........................default merge_file ...................... None........................default min_scale ....................... 1.0.........................default mmap_warmup ..................... False.......................default model_parallel_size ............. 1...........................default no_load_optim ................... False.......................default no_load_rng ..................... False.......................default no_save_optim ................... False.......................default no_save_rng ..................... False.......................default norm ............................ layernorm...................default num_gpus ........................ None........................default num_nodes ....................... -1..........................default num_samples ..................... 1...........................default num_unique_layers ............... None........................default onnx_safe ....................... False.......................default opt_pos_emb_offset .............. 0...........................default override_lr_scheduler ........... False.......................default padded_vocab_size ............... None........................default param_sharing_style ............. grouped.....................default pipe_partition_method ........... type:transformer|mlp........default prescale_gradients .............. False.......................default profile_backward ................ False.......................default prompt_end ...................... ...........................default rank ............................ None........................default recompute ....................... False.......................default return_logits ................... False.......................default rms_norm_epsilon ................ 1e-08.......................default rotary_emb_base ................. 10000.......................default rpe_max_distance ................ 128.........................default rpe_num_buckets ................. 32..........................default sample_input_file ............... None........................default sample_output_file .............. samples.txt.................default scaled_masked_softmax_fusion .... False.......................default scalenorm_epsilon ............... 1e-08.......................default scheduler ....................... None........................default seed ............................ 1234........................default short_seq_prob .................. 0.1.........................default soft_prompt_tuning .............. None........................default sparse_gradients ................ False.......................default split ........................... 969, 30, 1..................default steps_per_print ................. 10..........................default temperature ..................... 0.0.........................default tensorboard_dir ................. None........................default top_k ........................... 0...........................default top_p ........................... 0.0.........................default use_bnb_optimizer ............... False.......................default use_checkpoint_lr_scheduler ..... False.......................default use_cpu_initialization .......... False.......................default use_shared_fs ................... True........................default use_wandb ....................... None........................default wandb_host ...................... https://api.wandb.ai........default wandb_init_all_ranks ............ False.......................default wandb_project ................... neox........................default wandb_team ...................... None........................default warmup .......................... 0.01........................default weight_by_num_documents ......... False.......................default weighted_sampler_alpha .......... 0.3.........................default world_size ...................... None........................default zero_allow_untested_optimizer ... False.......................default ---------------- end of arguments ---------------- [2025-03-25 08:31:16,936] [WARNING] [runner.py:126:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only. [2025-03-25 08:31:16,936] [INFO] [runner.py:366:main] cmd = /usr/bin/python -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMF19 --master_addr=127.0.0.1 --master_port=29500 train.py --deepspeed_config {"train_batch_size": 1024, "train_micro_batch_size_per_gpu": 32, "gradient_accumulation_steps": 32, "optimizer": {"type": "Adam", "params": {"lr": 0.0006, "betas": [0.9, 0.95], "eps": 1e-08}}, "fp16": {"fp16": true, "enabled": true, "loss_scale": 0, "loss_scale_window": 1000, "initial_scale_power": 12, "hysteresis": 2, "min_loss_scale": 1}, "gradient_clipping": 1.0, "zero_optimization": {"stage": 1, "allgather_partitions": true, "allgather_bucket_size": 500000000, "overlap_comm": true, "reduce_scatter": true, "reduce_bucket_size": 500000000, "contiguous_gradients": true, "cpu_offload": false}, "wall_clock_breakdown": true} --megatron_config {"launcher": "slurm", "train_batch_size": 1024, "train_micro_batch_size_per_gpu": 32, "gradient_accumulation_steps": 32, "optimizer": {"type": "Adam", "params": {"lr": 0.0006, "betas": [0.9, 0.95], "eps": 1e-08}}, "fp16": {"fp16": true, "enabled": true, "loss_scale": 0, "loss_scale_window": 1000, "initial_scale_power": 12, "hysteresis": 2, "min_loss_scale": 1}, "gradient_clipping": 1.0, "zero_optimization": {"stage": 1, "allgather_partitions": true, "allgather_bucket_size": 500000000, "overlap_comm": true, "reduce_scatter": true, "reduce_bucket_size": 500000000, "contiguous_gradients": true, "cpu_offload": false}, "wall_clock_breakdown": true, "precision": "fp16", "num_layers": 12, "hidden_size": 768, "num_attention_heads": 12, "seq_length": 2048, "max_position_embeddings": 2048, "pos_emb": "rotary", "no_weight_tying": true, "attention_config": ["flash", "flash", "flash", "flash", "flash", "flash", "flash", "flash", "flash", "flash", "flash", "flash"], "sparsity_config": {}, "scaled_upper_triang_masked_softmax_fusion": true, "bias_gelu_fusion": true, "rotary_pct": 0.25, "init_method": "small_init", "output_layer_init_method": "wang_init", "gpt_j_residual": true, "output_layer_parallelism": "column", "lr_decay_style": "cosine", "lr_decay_iters": 143000, "min_lr": 6e-05, "optimizer_type": "Adam", "zero_stage": 1, "zero_reduce_scatter": true, "zero_contiguous_gradients": true, "zero_reduce_bucket_size": 500000000, "zero_allgather_bucket_size": 500000000, "lr": 0.0006, "tokenizer_type": "HFTokenizer", "train_data_paths": ["../input/pythia_mydata_idxmaps/mydata_left_text_document"], "test_data_paths": ["../input/pythia_mydata_idxmaps/mydata_left_text_document"], "valid_data_paths": ["../input/pythia_mydata_idxmaps/mydata_left_text_document"], "train_data_weights": [1.0], "valid_data_weights": [1.0], "test_data_weights": [1.0], "data_impl": "mmap", "save": "../checkpoints/mydata_left-pythia160m", "config_files": {"pythia-160m.yml": "{\n # parallelism settings\n \"pipe-parallel-size\": 1,\n \"model-parallel-size\": 1,\n\n # model settings\n \"num-layers\": 12,\n \"hidden-size\": 768,\n \"num-attention-heads\": 12,\n \"seq-length\": 2048,\n \"max-position-embeddings\": 2048,\n \"pos-emb\": \"rotary\",\n \"rotary-pct\": 0.25,\n \"no-weight-tying\": true,\n \"gpt-j-residual\": true,\n \"output-layer-parallelism\": \"column\",\n \n \"attention-config\": [[[\"flash\"], 12]],\n\n \"scaled-upper-triang-masked-softmax-fusion\": true,\n \"bias-gelu-fusion\": true,\n\n # init methods\n \"init_method\": \"small_init\",\n \"output_layer_init_method\": \"wang_init\",\n\n \"optimizer\": {\n \"type\": \"Adam\",\n \"params\": {\n \"lr\": 0.0006,\n \"betas\": [0.9, 0.95],\n \"eps\": 1.0e-8\n }\n },\n \"min_lr\": 0.00006,\n\n \"zero_optimization\": {\n \"stage\": 1,\n \"allgather_partitions\": true,\n \"allgather_bucket_size\": 500000000,\n \"overlap_comm\": true,\n \"reduce_scatter\": true,\n \"reduce_bucket_size\": 500000000,\n \"contiguous_gradients\": true,\n \"cpu_offload\": false\n },\n\n # batch size (trained on 32 gpus)\n \"train_micro_batch_size_per_gpu\": 32,\n \"gradient_accumulation_steps\": 32,\n \"gas\": 1,\n \"data-impl\": \"mmap\",\n \"num_workers\": 1,\n\n # activation checkpointing\n \"checkpoint-activations\": true,\n \"checkpoint-num-layers\": 1,\n \"partition-activations\": true,\n \"synchronize-each-layer\": true,\n\n # regularization\n \"gradient_clipping\": 1.0,\n \"weight-decay\": 0.1,\n \"hidden-dropout\": 0,\n \"attention-dropout\": 0,\n\n # precision settings\n \"fp16\": {\n \"fp16\": true,\n \"enabled\": true,\n \"loss_scale\": 0,\n \"loss_scale_window\": 1000,\n \"initial_scale_power\": 12,\n \"hysteresis\": 2,\n \"min_loss_scale\": 1\n },\n\n \"train-iters\": 143000,\n \"lr-decay-iters\": 143000,\n \"distributed-backend\": \"nccl\",\n \"lr-decay-style\": \"cosine\",\n \"warmup\": 0.01,\n \"checkpoint-factor\": 1000,\n \"extra-save-iters\": [0,1,2,4,8,16,32,64,128,256,512],\n \"eval-interval\": 40000,\n \"eval-iters\": 10,\n\n \"log-interval\": 10,\n \"steps_per_print\": 10,\n \"wall_clock_breakdown\": true,\n\n \"train-data-paths\": [\"../input/pythia_mydata_idxmaps/mydata_left_text_document\"],\n \"valid-data-paths\": [\"../input/pythia_mydata_idxmaps/mydata_left_text_document\"],\n \"test-data-paths\": [\"../input/pythia_mydata_idxmaps/mydata_left_text_document\"],\n\n \"tokenizer-type\": \"HFTokenizer\",\n \"vocab-file\": \"../input/20B_tokenizer.json\",\n\n \"launcher\": \"slurm\",\n \"deepspeed_slurm\": false,\n\n \"save\": \"../checkpoints/mydata_left-pythia160m\",\n \"load\": \"../checkpoints/mydata_left-pythia160m\",\n \"checkpoint_validation_with_forward_pass\": False,\n}\n"}, "load": "../checkpoints/mydata_left-pythia160m", "checkpoint_factor": 1000, "extra_save_iters": [0, 1, 2, 4, 8, 16, 32, 64, 128, 256, 512], "batch_size": 32, "train_iters": 143000, "eval_iters": 10, "eval_interval": 40000, "vocab_file": "../input/20B_tokenizer.json", "num_workers": 1, "attention_dropout": 0, "hidden_dropout": 0, "weight_decay": 0.1, "checkpoint_activations": true, "synchronize_each_layer": true, "partition_activations": true, "gas": 32, "clip_grad": 1.0, "dynamic_loss_scale": true, "pipe_parallel_size": 1, "is_pipe_parallel": true, "wandb_group": "jS3hHmmzm2uWaLRVKtLL8X_go87bv9r", "log_interval": 10, "text_gen_type": "unconditional", "user_script": "train.py", "save_iters": [0, 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000, 11000, 12000, 13000, 14000, 15000, 16000, 17000, 18000, 19000, 20000, 21000, 22000, 23000, 24000, 25000, 26000, 27000, 28000, 29000, 30000, 31000, 32000, 33000, 34000, 35000, 36000, 37000, 38000, 39000, 40000, 41000, 42000, 43000, 44000, 45000, 46000, 47000, 48000, 49000, 50000, 51000, 52000, 53000, 54000, 55000, 56000, 57000, 58000, 59000, 60000, 61000, 62000, 63000, 64000, 65000, 66000, 67000, 68000, 69000, 70000, 71000, 72000, 73000, 74000, 75000, 76000, 77000, 78000, 79000, 80000, 81000, 82000, 83000, 84000, 85000, 86000, 87000, 88000, 89000, 90000, 91000, 92000, 93000, 94000, 95000, 96000, 97000, 98000, 99000, 100000, 101000, 102000, 103000, 104000, 105000, 106000, 107000, 108000, 109000, 110000, 111000, 112000, 113000, 114000, 115000, 116000, 117000, 118000, 119000, 120000, 121000, 122000, 123000, 124000, 125000, 126000, 127000, 128000, 129000, 130000, 131000, 132000, 133000, 134000, 135000, 136000, 137000, 138000, 139000, 140000, 141000, 142000], "global_num_gpus": 1} [2025-03-25 08:31:17,787] [INFO] [launch.py:82:main] WORLD INFO DICT: {'localhost': [0]} [2025-03-25 08:31:17,787] [INFO] [launch.py:88:main] nnodes=1, num_local_procs=1, node_rank=0 [2025-03-25 08:31:17,787] [INFO] [launch.py:103:main] global_rank_mapping=defaultdict(, {'localhost': [0]}) [2025-03-25 08:31:17,787] [INFO] [launch.py:104:main] dist_world_size=1 [2025-03-25 08:31:17,787] [INFO] [launch.py:112:main] Setting CUDA_VISIBLE_DEVICES=0 NeoXArgs.configure_distributed_args() using world size: 1 and model-parallel size: 1 > building HFTokenizer tokenizer ... > padded vocab (size: 50277) with 27 dummy tokens (new size: 50304) > initializing torch distributed ... [2025-03-25 08:31:20,643] [INFO] [distributed.py:46:init_distributed] Initializing torch distributed with backend: nccl > initializing model parallel with size 1 MPU DP: [0] MPU PP: [0] MPU MP: [0] > setting random seeds to 1234 ... [2025-03-25 08:31:20,644] [INFO] [checkpointing.py:223:model_parallel_cuda_manual_seed] > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 3952 and data parallel seed: 1234 make: Entering directory '/data/dusi/pythia-retrain/gpt-neox/megatron/data' make: Nothing to be done for 'default'. make: Leaving directory '/data/dusi/pythia-retrain/gpt-neox/megatron/data' building GPT2 model ... SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0} [2025-03-25 08:31:31,759] [INFO] [module.py:363:_partition_layers] Partitioning pipeline stages with method type:transformer|mlp stage=0 layers=17 0: EmbeddingPipe 1: _pre_transformer_block 2: ParallelTransformerLayerPipe 3: ParallelTransformerLayerPipe 4: ParallelTransformerLayerPipe 5: ParallelTransformerLayerPipe 6: ParallelTransformerLayerPipe 7: ParallelTransformerLayerPipe 8: ParallelTransformerLayerPipe 9: ParallelTransformerLayerPipe 10: ParallelTransformerLayerPipe 11: ParallelTransformerLayerPipe 12: ParallelTransformerLayerPipe 13: ParallelTransformerLayerPipe 14: _post_transformer_block 15: NormPipe 16: ParallelLinearPipe loss: partial Configuring Optimizer type: Adam with params: {'lr': 0.0006, 'betas': [0.9, 0.95], 'eps': 1e-08} > learning rate decay style: cosine DeepSpeed is enabled. [2025-03-25 08:31:31,890] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.3.15+eb7f5cf, git-hash=eb7f5cf, git-branch=HEAD [2025-03-25 08:31:31,890] [WARNING] [config.py:77:_sanity_check] DeepSpeedConfig: cpu_offload is deprecated. Please use offload_optimizer. [2025-03-25 08:31:32,208] [INFO] [engine.py:654:_configure_optimizer] Removing param_group that has no 'params' in the client Optimizer [2025-03-25 08:31:32,208] [INFO] [engine.py:659:_configure_optimizer] Using client Optimizer as basic optimizer [2025-03-25 08:31:32,209] [INFO] [engine.py:668:_configure_optimizer] DeepSpeed Basic Optimizer = FusedAdam Checking ZeRO support for optimizer=FusedAdam type= [2025-03-25 08:31:32,209] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 ZeRO stage 1 optimizer Using /root/.cache/torch_extensions as PyTorch extensions root... Emitting ninja build file /root/.cache/torch_extensions/utils/build.ninja... Building extension module utils... Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) ninja: no work to do. Loading extension module utils... Time to load utils op: 0.6376595497131348 seconds [2025-03-25 08:31:32,848] [INFO] [stage1.py:160:__init__] ZeRO Elastic Checkpoint = True [2025-03-25 08:31:32,848] [INFO] [logging.py:60:log_dist] [Rank 0] Using default max_elements_per_comm 500000000 [2025-03-25 08:31:32,848] [INFO] [logging.py:60:log_dist] [Rank 0] Total number of elements in model: 162201600, max elements per com: 500000000 [2025-03-25 08:31:32,848] [INFO] [logging.py:60:log_dist] [Rank 0] sub_partition_count: 1, sub_partition_size: 162201600, padding: 0 [2025-03-25 08:31:32,848] [INFO] [logging.py:60:log_dist] [Rank 0] number of elements with padding: 162201600 + 0 = 162201600 [2025-03-25 08:31:33,028] [INFO] [stage1.py:375:get_data_parallel_sub_partitions] **** partition info: [2025-03-25 08:31:33,028] [INFO] [stage1.py:376:get_data_parallel_sub_partitions] total_num_elements=162201600 [2025-03-25 08:31:33,028] [INFO] [stage1.py:377:get_data_parallel_sub_partitions] world_size=1 [2025-03-25 08:31:33,028] [INFO] [stage1.py:378:get_data_parallel_sub_partitions] max_elements_per_comm=162201600 [2025-03-25 08:31:33,028] [INFO] [stage1.py:379:get_data_parallel_sub_partitions] sub_partition_size=162201600 [2025-03-25 08:31:33,028] [INFO] [stage1.py:380:get_data_parallel_sub_partitions] num_sub_partitions=1 [2025-03-25 08:31:33,028] [INFO] [stage1.py:381:get_data_parallel_sub_partitions] num_comm_intervals=1 [2025-03-25 08:31:33,029] [INFO] [stage1.py:382:get_data_parallel_sub_partitions] **** [2025-03-25 08:31:33,050] [INFO] [logging.py:60:log_dist] [Rank 0] Using default max_elements_per_comm 500000000 [2025-03-25 08:31:33,050] [INFO] [logging.py:60:log_dist] [Rank 0] Total number of elements in model: 121344, max elements per com: 500000000 [2025-03-25 08:31:33,050] [INFO] [logging.py:60:log_dist] [Rank 0] sub_partition_count: 1, sub_partition_size: 121344, padding: 0 [2025-03-25 08:31:33,050] [INFO] [logging.py:60:log_dist] [Rank 0] number of elements with padding: 121344 + 0 = 121344 [2025-03-25 08:31:33,051] [INFO] [stage1.py:375:get_data_parallel_sub_partitions] **** partition info: [2025-03-25 08:31:33,051] [INFO] [stage1.py:376:get_data_parallel_sub_partitions] total_num_elements=121344 [2025-03-25 08:31:33,051] [INFO] [stage1.py:377:get_data_parallel_sub_partitions] world_size=1 [2025-03-25 08:31:33,051] [INFO] [stage1.py:378:get_data_parallel_sub_partitions] max_elements_per_comm=121344 [2025-03-25 08:31:33,051] [INFO] [stage1.py:379:get_data_parallel_sub_partitions] sub_partition_size=121344 [2025-03-25 08:31:33,051] [INFO] [stage1.py:380:get_data_parallel_sub_partitions] num_sub_partitions=1 [2025-03-25 08:31:33,051] [INFO] [stage1.py:381:get_data_parallel_sub_partitions] num_comm_intervals=1 [2025-03-25 08:31:33,051] [INFO] [stage1.py:382:get_data_parallel_sub_partitions] **** [2025-03-25 08:31:33,376] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam [2025-03-25 08:31:33,376] [INFO] [engine.py:498:_configure_lr_scheduler] DeepSpeed using client LR scheduler [2025-03-25 08:31:33,376] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = [2025-03-25 08:31:33,376] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0, 0.0], mom=[[0.9, 0.95], [0.9, 0.95]] [2025-03-25 08:31:33,376] [INFO] [config.py:759:print] DeepSpeedEngine configuration: [2025-03-25 08:31:33,376] [INFO] [config.py:763:print] activation_checkpointing_config { "partition_activations": false, "contiguous_memory_optimization": false, "cpu_checkpointing": false, "number_checkpoints": null, "synchronize_checkpoint_boundary": false, "profile": false } [2025-03-25 08:31:33,376] [INFO] [config.py:763:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} [2025-03-25 08:31:33,376] [INFO] [config.py:763:print] allreduce_always_fp32 ........ False [2025-03-25 08:31:33,376] [INFO] [config.py:763:print] amp_enabled .................. False [2025-03-25 08:31:33,376] [INFO] [config.py:763:print] amp_params ................... False [2025-03-25 08:31:33,376] [INFO] [config.py:763:print] checkpoint_tag_validation_enabled True [2025-03-25 08:31:33,376] [INFO] [config.py:763:print] checkpoint_tag_validation_fail False [2025-03-25 08:31:33,376] [INFO] [config.py:763:print] disable_allgather ............ False [2025-03-25 08:31:33,376] [INFO] [config.py:763:print] dump_state ................... False [2025-03-25 08:31:33,377] [INFO] [config.py:763:print] dynamic_loss_scale_args ...... {'init_scale': 4096, 'scale_window': 1000, 'delayed_shift': 2, 'min_scale': 1} [2025-03-25 08:31:33,377] [INFO] [config.py:763:print] elasticity_enabled ........... False [2025-03-25 08:31:33,377] [INFO] [config.py:763:print] flops_profiler_config ........ { "enabled": false, "profile_step": 1, "module_depth": -1, "top_modules": 3, "detailed": true } [2025-03-25 08:31:33,377] [INFO] [config.py:763:print] fp16_enabled ................. True [2025-03-25 08:31:33,377] [INFO] [config.py:763:print] fp16_type .................... fp16 [2025-03-25 08:31:33,377] [INFO] [config.py:763:print] global_rank .................. 0 [2025-03-25 08:31:33,377] [INFO] [config.py:763:print] gradient_accumulation_steps .. 32 [2025-03-25 08:31:33,377] [INFO] [config.py:763:print] gradient_clipping ............ 1.0 [2025-03-25 08:31:33,377] [INFO] [config.py:763:print] gradient_predivide_factor .... 1.0 [2025-03-25 08:31:33,377] [INFO] [config.py:763:print] initial_dynamic_scale ........ 4096 [2025-03-25 08:31:33,377] [INFO] [config.py:763:print] loss_scale ................... 0 [2025-03-25 08:31:33,377] [INFO] [config.py:763:print] memory_breakdown ............. False [2025-03-25 08:31:33,377] [INFO] [config.py:763:print] optimizer_legacy_fusion ...... False [2025-03-25 08:31:33,377] [INFO] [config.py:763:print] optimizer_name ............... adam [2025-03-25 08:31:33,377] [INFO] [config.py:763:print] optimizer_params ............. {'lr': 0.0006, 'betas': [0.9, 0.95], 'eps': 1e-08} [2025-03-25 08:31:33,377] [INFO] [config.py:763:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} [2025-03-25 08:31:33,377] [INFO] [config.py:763:print] pld_enabled .................. False [2025-03-25 08:31:33,377] [INFO] [config.py:763:print] pld_params ................... False [2025-03-25 08:31:33,377] [INFO] [config.py:763:print] precision .................... torch.float16 [2025-03-25 08:31:33,377] [INFO] [config.py:763:print] prescale_gradients ........... False [2025-03-25 08:31:33,377] [INFO] [config.py:763:print] scheduler_name ............... None [2025-03-25 08:31:33,377] [INFO] [config.py:763:print] scheduler_params ............. None [2025-03-25 08:31:33,377] [INFO] [config.py:763:print] sparse_attention ............. None [2025-03-25 08:31:33,377] [INFO] [config.py:763:print] sparse_gradients_enabled ..... False [2025-03-25 08:31:33,377] [INFO] [config.py:763:print] steps_per_print .............. 10 [2025-03-25 08:31:33,377] [INFO] [config.py:763:print] tensorboard_enabled .......... False [2025-03-25 08:31:33,377] [INFO] [config.py:763:print] tensorboard_job_name ......... DeepSpeedJobName [2025-03-25 08:31:33,377] [INFO] [config.py:763:print] tensorboard_output_path ...... [2025-03-25 08:31:33,377] [INFO] [config.py:763:print] train_batch_size ............. 1024 [2025-03-25 08:31:33,377] [INFO] [config.py:763:print] train_micro_batch_size_per_gpu 32 [2025-03-25 08:31:33,377] [INFO] [config.py:763:print] wall_clock_breakdown ......... True [2025-03-25 08:31:33,377] [INFO] [config.py:763:print] world_size ................... 1 [2025-03-25 08:31:33,377] [INFO] [config.py:763:print] zero_allow_untested_optimizer False [2025-03-25 08:31:33,377] [INFO] [config.py:763:print] zero_config .................. { "stage": 1, "contiguous_gradients": true, "reduce_scatter": true, "reduce_bucket_size": 5.000000e+08, "allgather_partitions": true, "allgather_bucket_size": 5.000000e+08, "overlap_comm": true, "load_from_fp32_weights": true, "elastic_checkpoint": true, "offload_param": null, "offload_optimizer": null, "sub_group_size": 1.000000e+12, "prefetch_bucket_size": 5.000000e+07, "param_persistence_threshold": 1.000000e+05, "max_live_parameters": 1.000000e+09, "max_reuse_distance": 1.000000e+09, "gather_fp16_weights_on_model_save": false } [2025-03-25 08:31:33,377] [INFO] [config.py:763:print] zero_enabled ................. True [2025-03-25 08:31:33,377] [INFO] [config.py:763:print] zero_optimization_stage ...... 1 [2025-03-25 08:31:33,378] [INFO] [config.py:765:print] json = { "train_batch_size": 1.024000e+03, "train_micro_batch_size_per_gpu": 32, "gradient_accumulation_steps": 32, "optimizer": { "type": "Adam", "params": { "lr": 0.0006, "betas": [0.9, 0.95], "eps": 1e-08 } }, "fp16": { "fp16": true, "enabled": true, "loss_scale": 0, "loss_scale_window": 1000, "initial_scale_power": 12, "hysteresis": 2, "min_loss_scale": 1 }, "gradient_clipping": 1.0, "zero_optimization": { "stage": 1, "allgather_partitions": true, "allgather_bucket_size": 5.000000e+08, "overlap_comm": true, "reduce_scatter": true, "reduce_bucket_size": 5.000000e+08, "contiguous_gradients": true, "cpu_offload": false }, "wall_clock_breakdown": true } Using /root/.cache/torch_extensions as PyTorch extensions root... No modifications detected for re-loaded extension module utils, skipping build step... Loading extension module utils... Time to load utils op: 0.0004591941833496094 seconds [2025-03-25 08:31:33,379] [INFO] [engine.py:84:__init__] CONFIG: micro_batches=32 micro_batch_size=32 [2025-03-25 08:31:33,411] [INFO] [engine.py:141:__init__] RANK=0 STAGE=0 LAYERS=17 [0, 17) STAGE_PARAMS=162322944 (162.323M) TOTAL_PARAMS=162322944 (162.323M) UNIQUE_PARAMS=162322944 (162.323M) > number of parameters on model parallel rank 0: 162322944 > total params: 162,322,944 [2025-03-25 08:31:33,456] [INFO] [engine.py:1551:_load_checkpoint] rank: 0 loading checkpoint: ../checkpoints/mydata_left-pythia160m/global_step512/mp_rank_00_model_states.pt [2025-03-25 08:31:33,499] [INFO] [module.py:576:load_state_dir] RANK=0 Loaded layer=0 file=../checkpoints/mydata_left-pythia160m/global_step512/layer_00-model_00-model_states.pt [2025-03-25 08:31:33,508] [INFO] [module.py:576:load_state_dir] RANK=0 Loaded layer=2 file=../checkpoints/mydata_left-pythia160m/global_step512/layer_02-model_00-model_states.pt [2025-03-25 08:31:33,514] [INFO] [module.py:576:load_state_dir] RANK=0 Loaded layer=3 file=../checkpoints/mydata_left-pythia160m/global_step512/layer_03-model_00-model_states.pt [2025-03-25 08:31:33,520] [INFO] [module.py:576:load_state_dir] RANK=0 Loaded layer=4 file=../checkpoints/mydata_left-pythia160m/global_step512/layer_04-model_00-model_states.pt [2025-03-25 08:31:33,539] [INFO] [module.py:576:load_state_dir] RANK=0 Loaded layer=5 file=../checkpoints/mydata_left-pythia160m/global_step512/layer_05-model_00-model_states.pt [2025-03-25 08:31:33,545] [INFO] [module.py:576:load_state_dir] RANK=0 Loaded layer=6 file=../checkpoints/mydata_left-pythia160m/global_step512/layer_06-model_00-model_states.pt [2025-03-25 08:31:33,552] [INFO] [module.py:576:load_state_dir] RANK=0 Loaded layer=7 file=../checkpoints/mydata_left-pythia160m/global_step512/layer_07-model_00-model_states.pt [2025-03-25 08:31:33,557] [INFO] [module.py:576:load_state_dir] RANK=0 Loaded layer=8 file=../checkpoints/mydata_left-pythia160m/global_step512/layer_08-model_00-model_states.pt [2025-03-25 08:31:33,563] [INFO] [module.py:576:load_state_dir] RANK=0 Loaded layer=9 file=../checkpoints/mydata_left-pythia160m/global_step512/layer_09-model_00-model_states.pt [2025-03-25 08:31:33,569] [INFO] [module.py:576:load_state_dir] RANK=0 Loaded layer=10 file=../checkpoints/mydata_left-pythia160m/global_step512/layer_10-model_00-model_states.pt [2025-03-25 08:31:33,575] [INFO] [module.py:576:load_state_dir] RANK=0 Loaded layer=11 file=../checkpoints/mydata_left-pythia160m/global_step512/layer_11-model_00-model_states.pt [2025-03-25 08:31:33,580] [INFO] [module.py:576:load_state_dir] RANK=0 Loaded layer=12 file=../checkpoints/mydata_left-pythia160m/global_step512/layer_12-model_00-model_states.pt [2025-03-25 08:31:33,586] [INFO] [module.py:576:load_state_dir] RANK=0 Loaded layer=13 file=../checkpoints/mydata_left-pythia160m/global_step512/layer_13-model_00-model_states.pt [2025-03-25 08:31:33,587] [INFO] [module.py:576:load_state_dir] RANK=0 Loaded layer=15 file=../checkpoints/mydata_left-pythia160m/global_step512/layer_15-model_00-model_states.pt [2025-03-25 08:31:33,627] [INFO] [module.py:576:load_state_dir] RANK=0 Loaded layer=16 file=../checkpoints/mydata_left-pythia160m/global_step512/layer_16-model_00-model_states.pt > using checkpoint value 0.0006 for learning rate > using checkpoint value 6e-05 for minimum learning rate > using checkpoint value 1430.0 for warmup iterations > using checkpoint value 143000 for total number of iterations > using checkpoint value cosine for decay style successfully loaded 1 ZeRO state_dicts for rank 0 [2025-03-25 08:31:34,510] [INFO] [logging.py:60:log_dist] [Rank 0] Total number of elements in model: 162201600, max elements per com: 500000000 [2025-03-25 08:31:34,510] [INFO] [logging.py:60:log_dist] [Rank 0] sub_partition_count: 1, sub_partition_size: 162201600, padding: 0 [2025-03-25 08:31:34,510] [INFO] [logging.py:60:log_dist] [Rank 0] number of elements with padding: 162201600 + 0 = 162201600 [2025-03-25 08:31:34,511] [INFO] [stage1.py:375:get_data_parallel_sub_partitions] **** partition info: [2025-03-25 08:31:34,511] [INFO] [stage1.py:376:get_data_parallel_sub_partitions] total_num_elements=162201600 [2025-03-25 08:31:34,511] [INFO] [stage1.py:377:get_data_parallel_sub_partitions] world_size=1 [2025-03-25 08:31:34,511] [INFO] [stage1.py:378:get_data_parallel_sub_partitions] max_elements_per_comm=162201600 [2025-03-25 08:31:34,511] [INFO] [stage1.py:379:get_data_parallel_sub_partitions] sub_partition_size=162201600 [2025-03-25 08:31:34,511] [INFO] [stage1.py:380:get_data_parallel_sub_partitions] num_sub_partitions=1 [2025-03-25 08:31:34,511] [INFO] [stage1.py:381:get_data_parallel_sub_partitions] num_comm_intervals=1 [2025-03-25 08:31:34,511] [INFO] [stage1.py:382:get_data_parallel_sub_partitions] **** [2025-03-25 08:31:34,511] [INFO] [logging.py:60:log_dist] [Rank 0] Total number of elements in model: 162201600, max elements per com: 500000000 [2025-03-25 08:31:34,511] [INFO] [logging.py:60:log_dist] [Rank 0] sub_partition_count: 1, sub_partition_size: 162201600, padding: 0 [2025-03-25 08:31:34,511] [INFO] [logging.py:60:log_dist] [Rank 0] number of elements with padding: 162201600 + 0 = 162201600 [2025-03-25 08:31:34,511] [INFO] [stage1.py:375:get_data_parallel_sub_partitions] **** partition info: [2025-03-25 08:31:34,512] [INFO] [stage1.py:376:get_data_parallel_sub_partitions] total_num_elements=162201600 [2025-03-25 08:31:34,512] [INFO] [stage1.py:377:get_data_parallel_sub_partitions] world_size=1 [2025-03-25 08:31:34,512] [INFO] [stage1.py:378:get_data_parallel_sub_partitions] max_elements_per_comm=162201600 [2025-03-25 08:31:34,512] [INFO] [stage1.py:379:get_data_parallel_sub_partitions] sub_partition_size=162201600 [2025-03-25 08:31:34,512] [INFO] [stage1.py:380:get_data_parallel_sub_partitions] num_sub_partitions=1 [2025-03-25 08:31:34,512] [INFO] [stage1.py:381:get_data_parallel_sub_partitions] num_comm_intervals=1 [2025-03-25 08:31:34,512] [INFO] [stage1.py:382:get_data_parallel_sub_partitions] **** [2025-03-25 08:31:34,512] [INFO] [logging.py:60:log_dist] [Rank 0] Total number of elements in model: 121344, max elements per com: 500000000 [2025-03-25 08:31:34,512] [INFO] [logging.py:60:log_dist] [Rank 0] sub_partition_count: 1, sub_partition_size: 121344, padding: 0 [2025-03-25 08:31:34,512] [INFO] [logging.py:60:log_dist] [Rank 0] number of elements with padding: 121344 + 0 = 121344 [2025-03-25 08:31:34,512] [INFO] [stage1.py:375:get_data_parallel_sub_partitions] **** partition info: [2025-03-25 08:31:34,512] [INFO] [stage1.py:376:get_data_parallel_sub_partitions] total_num_elements=121344 [2025-03-25 08:31:34,512] [INFO] [stage1.py:377:get_data_parallel_sub_partitions] world_size=1 [2025-03-25 08:31:34,512] [INFO] [stage1.py:378:get_data_parallel_sub_partitions] max_elements_per_comm=121344 [2025-03-25 08:31:34,513] [INFO] [stage1.py:379:get_data_parallel_sub_partitions] sub_partition_size=121344 [2025-03-25 08:31:34,513] [INFO] [stage1.py:380:get_data_parallel_sub_partitions] num_sub_partitions=1 [2025-03-25 08:31:34,513] [INFO] [stage1.py:381:get_data_parallel_sub_partitions] num_comm_intervals=1 [2025-03-25 08:31:34,513] [INFO] [stage1.py:382:get_data_parallel_sub_partitions] **** [2025-03-25 08:31:34,513] [INFO] [logging.py:60:log_dist] [Rank 0] Total number of elements in model: 121344, max elements per com: 500000000 [2025-03-25 08:31:34,513] [INFO] [logging.py:60:log_dist] [Rank 0] sub_partition_count: 1, sub_partition_size: 121344, padding: 0 [2025-03-25 08:31:34,513] [INFO] [logging.py:60:log_dist] [Rank 0] number of elements with padding: 121344 + 0 = 121344 [2025-03-25 08:31:34,513] [INFO] [stage1.py:375:get_data_parallel_sub_partitions] **** partition info: [2025-03-25 08:31:34,513] [INFO] [stage1.py:376:get_data_parallel_sub_partitions] total_num_elements=121344 [2025-03-25 08:31:34,513] [INFO] [stage1.py:377:get_data_parallel_sub_partitions] world_size=1 [2025-03-25 08:31:34,513] [INFO] [stage1.py:378:get_data_parallel_sub_partitions] max_elements_per_comm=121344 [2025-03-25 08:31:34,513] [INFO] [stage1.py:379:get_data_parallel_sub_partitions] sub_partition_size=121344 [2025-03-25 08:31:34,513] [INFO] [stage1.py:380:get_data_parallel_sub_partitions] num_sub_partitions=1 [2025-03-25 08:31:34,513] [INFO] [stage1.py:381:get_data_parallel_sub_partitions] num_comm_intervals=1 [2025-03-25 08:31:34,514] [INFO] [stage1.py:382:get_data_parallel_sub_partitions] **** [2025-03-25 08:31:34,601] [INFO] [logging.py:60:log_dist] [Rank 0] Total number of elements in model: 162201600, max elements per com: 500000000 [2025-03-25 08:31:34,601] [INFO] [logging.py:60:log_dist] [Rank 0] sub_partition_count: 1, sub_partition_size: 162201600, padding: 0 [2025-03-25 08:31:34,601] [INFO] [logging.py:60:log_dist] [Rank 0] number of elements with padding: 162201600 + 0 = 162201600 [2025-03-25 08:31:34,601] [INFO] [stage1.py:375:get_data_parallel_sub_partitions] **** partition info: [2025-03-25 08:31:34,601] [INFO] [stage1.py:376:get_data_parallel_sub_partitions] total_num_elements=162201600 [2025-03-25 08:31:34,601] [INFO] [stage1.py:377:get_data_parallel_sub_partitions] world_size=1 [2025-03-25 08:31:34,601] [INFO] [stage1.py:378:get_data_parallel_sub_partitions] max_elements_per_comm=162201600 [2025-03-25 08:31:34,601] [INFO] [stage1.py:379:get_data_parallel_sub_partitions] sub_partition_size=162201600 [2025-03-25 08:31:34,601] [INFO] [stage1.py:380:get_data_parallel_sub_partitions] num_sub_partitions=1 [2025-03-25 08:31:34,602] [INFO] [stage1.py:381:get_data_parallel_sub_partitions] num_comm_intervals=1 [2025-03-25 08:31:34,602] [INFO] [stage1.py:382:get_data_parallel_sub_partitions] **** [2025-03-25 08:31:34,602] [INFO] [logging.py:60:log_dist] [Rank 0] Total number of elements in model: 121344, max elements per com: 500000000 [2025-03-25 08:31:34,602] [INFO] [logging.py:60:log_dist] [Rank 0] sub_partition_count: 1, sub_partition_size: 121344, padding: 0 [2025-03-25 08:31:34,602] [INFO] [logging.py:60:log_dist] [Rank 0] number of elements with padding: 121344 + 0 = 121344 [2025-03-25 08:31:34,602] [INFO] [stage1.py:375:get_data_parallel_sub_partitions] **** partition info: [2025-03-25 08:31:34,602] [INFO] [stage1.py:376:get_data_parallel_sub_partitions] total_num_elements=121344 [2025-03-25 08:31:34,602] [INFO] [stage1.py:377:get_data_parallel_sub_partitions] world_size=1 [2025-03-25 08:31:34,602] [INFO] [stage1.py:378:get_data_parallel_sub_partitions] max_elements_per_comm=121344 [2025-03-25 08:31:34,602] [INFO] [stage1.py:379:get_data_parallel_sub_partitions] sub_partition_size=121344 [2025-03-25 08:31:34,602] [INFO] [stage1.py:380:get_data_parallel_sub_partitions] num_sub_partitions=1 [2025-03-25 08:31:34,602] [INFO] [stage1.py:381:get_data_parallel_sub_partitions] num_comm_intervals=1 [2025-03-25 08:31:34,603] [INFO] [stage1.py:382:get_data_parallel_sub_partitions] **** loading 1 zero partition checkpoints for rank 0 > validated currently set args with arguments in the checkpoint ... successfully loaded ../checkpoints/mydata_left-pythia160m/global_step512/mp_rank_00_model_states.pt Loading checkpoint and starting from iteration 512 > building train, validation, and test datasets ... reading sizes... reading pointers... reading document index... creating numpy buffer of mmap... creating memory view of numpy buffer... train_0: no. of documents:281298 > loading doc-idx mapping from ../input/pythia_mydata_idxmaps/mydata_left_text_document_train_0_indexmap_147164160ns_2048sl_1234s_doc_idx.npy > loading sample-idx mapping from ../input/pythia_mydata_idxmaps/mydata_left_text_document_train_0_indexmap_147164160ns_2048sl_1234s_sample_idx.npy > loading shuffle-idx mapping from ../input/pythia_mydata_idxmaps/mydata_left_text_document_train_0_indexmap_147164160ns_2048sl_1234s_shuffle_idx.npy loaded indexed file in 0.004 seconds total number of samples: 147236903 total number of epochs: 1203 WARNING: shuffle index length (147236901) is not equal to sample index length (147236902) reading sizes... reading pointers... reading document index... creating numpy buffer of mmap... creating memory view of numpy buffer... valid_0: no. of documents:281298 > loading doc-idx mapping from ../input/pythia_mydata_idxmaps/mydata_left_text_document_valid_0_indexmap_41165ns_2048sl_1234s_doc_idx.npy > loading sample-idx mapping from ../input/pythia_mydata_idxmaps/mydata_left_text_document_valid_0_indexmap_41165ns_2048sl_1234s_sample_idx.npy > loading shuffle-idx mapping from ../input/pythia_mydata_idxmaps/mydata_left_text_document_valid_0_indexmap_41165ns_2048sl_1234s_shuffle_idx.npy loaded indexed file in 0.376 seconds total number of samples: 122392 total number of epochs: 1 WARNING: shuffle index length (122390) is not equal to sample index length (122391) reading sizes... reading pointers... reading document index... creating numpy buffer of mmap... creating memory view of numpy buffer... test_0: no. of documents:281298 > loading doc-idx mapping from ../input/pythia_mydata_idxmaps/mydata_left_text_document_test_0_indexmap_10292ns_2048sl_1234s_doc_idx.npy > loading sample-idx mapping from ../input/pythia_mydata_idxmaps/mydata_left_text_document_test_0_indexmap_10292ns_2048sl_1234s_sample_idx.npy > loading shuffle-idx mapping from ../input/pythia_mydata_idxmaps/mydata_left_text_document_test_0_indexmap_10292ns_2048sl_1234s_shuffle_idx.npy loaded indexed file in 3.370 seconds total number of samples: 122392 total number of epochs: 1 WARNING: shuffle index length (122390) is not equal to sample index length (122391) > building indices for blendable datasets ... > sample ratios: dataset 0, input: 1, achieved: 1 > RANK 0 elapsed time for building blendable dataset indices: 0.93 (sec) > building indices for blendable datasets ... > sample ratios: dataset 0, input: 1, achieved: 1 > RANK 0 elapsed time for building blendable dataset indices: 0.00 (sec) > building indices for blendable datasets ... > sample ratios: dataset 0, input: 1, achieved: 1 > RANK 0 elapsed time for building blendable dataset indices: 0.00 (sec) setting training data start iteration to 16384 setting validation data start iteration to 0 done with setups ... time (ms) | model and optimizer: 3013.85 | train/valid/test data iterators: 5550.07 training ... [2025-03-25 08:31:40,450] [INFO] [checkpointing.py:405:forward] Activation Checkpointing Information [2025-03-25 08:31:40,450] [INFO] [checkpointing.py:406:forward] ----Partition Activations True, CPU CHECKPOINTING False [2025-03-25 08:31:40,450] [INFO] [checkpointing.py:409:forward] ----contiguous Memory Checkpointing False with 12 total layers [2025-03-25 08:31:40,450] [INFO] [checkpointing.py:412:forward] ----Synchronization True [2025-03-25 08:31:40,450] [INFO] [checkpointing.py:413:forward] ----Profiling False [2025-03-25 08:39:52,194] [INFO] [logging.py:60:log_dist] [Rank 0] step=520, skipped=0, lr=[0.00021818181818181818, 0.00021818181818181818], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 520 loss: 4.9308 iter time (s): 49.187 samples/sec: 20.819 %comms: 0.00295660488405902 %optimizer_step 0.05866385708868236 %forward: 24.02070584717747 %backward: 63.53436379076071 [2025-03-25 08:39:52,195] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13964.63 | forward: 118150.05 | backward_microstep: 312510.87 | backward: 312504.91 | backward_inner_microstep: 312492.31 | backward_inner: 312487.54 | backward_allreduce_microstep: 6.08 | backward_allreduce: 2.09 | reduce_tied_grads: 0.32 | comms: 14.54 | reduce_grads: 0.15 | step: 288.55 | _step_clipping: 0.12 | _step_step: 287.03 | _step_zero_grad: 0.41 | _step_check_overflow: 0.51 samples/sec: 20.818 | iteration 520/ 143000 | elapsed time per iteration (ms): 49187.2 | learning rate: 2.182E-04 | approx flops per GPU: 89.8TFLOPS | lm_loss: 3.972491E+00 | loss scale: 4096.0 | number of skipped iterations: 0 | number of nan iterations: 0 | after 520 iterations memory (MB) | allocated: 14757.37890625 | max allocated: 35286.4248046875 | reserved: 37280.0 | max reserved: 37280.0 time (ms) [2025-03-25 08:50:08,317] [INFO] [logging.py:60:log_dist] [Rank 0] step=530, skipped=0, lr=[0.00022237762237762234, 0.00022237762237762234], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 530 loss: 4.8891 iter time (s): 61.612 samples/sec: 16.620 %comms: 0.0029538972771398324 %optimizer_step 0.05736579650007581 %forward: 23.58502321421604 %backward: 63.27743614368717 [2025-03-25 08:50:08,317] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22151.37 | forward: 145311.18 | backward_microstep: 389870.39 | backward: 389862.62 | backward_inner_microstep: 389846.96 | backward_inner: 389840.68 | backward_allreduce_microstep: 7.50 | backward_allreduce: 2.61 | reduce_tied_grads: 0.34 | comms: 18.20 | reduce_grads: 0.19 | step: 353.44 | _step_clipping: 0.14 | _step_step: 351.62 | _step_zero_grad: 0.48 | _step_check_overflow: 0.63 samples/sec: 16.620 | iteration 530/ 143000 | elapsed time per iteration (ms): 61612.3 | learning rate: 2.224E-04 | approx flops per GPU: 71.7TFLOPS | lm_loss: 4.916033E+00 | loss scale: 4096.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 09:00:19,323] [INFO] [logging.py:60:log_dist] [Rank 0] step=540, skipped=0, lr=[0.00022657342657342655, 0.00022657342657342655], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 540 loss: 4.8659 iter time (s): 61.100 samples/sec: 16.759 %comms: 0.002973363275000188 %optimizer_step 0.05675600646676759 %forward: 23.773955120971955 %backward: 63.825730561529795 [2025-03-25 09:00:19,323] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16989.14 | forward: 145258.93 | backward_microstep: 389983.12 | backward: 389975.38 | backward_inner_microstep: 389959.99 | backward_inner: 389954.05 | backward_allreduce_microstep: 7.39 | backward_allreduce: 2.52 | reduce_tied_grads: 0.35 | comms: 18.17 | reduce_grads: 0.19 | step: 346.78 | _step_clipping: 0.14 | _step_step: 345.00 | _step_zero_grad: 0.51 | _step_check_overflow: 0.57 samples/sec: 16.759 | iteration 540/ 143000 | elapsed time per iteration (ms): 61100.6 | learning rate: 2.266E-04 | approx flops per GPU: 72.3TFLOPS | lm_loss: 4.875730E+00 | loss scale: 4096.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 09:10:30,682] [INFO] [logging.py:60:log_dist] [Rank 0] step=550, skipped=0, lr=[0.00023076923076923074, 0.00023076923076923074], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 550 loss: 4.8720 iter time (s): 61.135 samples/sec: 16.750 %comms: 0.002993835581470072 %optimizer_step 0.05713624919524289 %forward: 23.77043451898166 %backward: 63.7800433241329 [2025-03-25 09:10:30,683] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17314.22 | forward: 145321.37 | backward_microstep: 389930.99 | backward: 389921.50 | backward_inner_microstep: 389905.54 | backward_inner: 389899.41 | backward_allreduce_microstep: 7.69 | backward_allreduce: 2.78 | reduce_tied_grads: 0.33 | comms: 18.30 | reduce_grads: 0.20 | step: 349.30 | _step_clipping: 0.14 | _step_step: 347.59 | _step_zero_grad: 0.48 | _step_check_overflow: 0.51 samples/sec: 16.750 | iteration 550/ 143000 | elapsed time per iteration (ms): 61135.9 | learning rate: 2.308E-04 | approx flops per GPU: 72.3TFLOPS | lm_loss: 4.863257E+00 | loss scale: 4096.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 09:20:41,169] [INFO] [logging.py:60:log_dist] [Rank 0] step=560, skipped=0, lr=[0.00023496503496503495, 0.00023496503496503495], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 560 loss: 4.8222 iter time (s): 61.048 samples/sec: 16.774 %comms: 0.002945621996842174 %optimizer_step 0.05717390434059526 %forward: 23.802486119102944 %backward: 63.867747611679206 [2025-03-25 09:20:41,170] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16482.39 | forward: 145309.82 | backward_microstep: 389910.11 | backward: 389900.90 | backward_inner_microstep: 389884.83 | backward_inner: 389878.53 | backward_allreduce_microstep: 7.65 | backward_allreduce: 2.61 | reduce_tied_grads: 0.35 | comms: 17.98 | reduce_grads: 0.20 | step: 349.04 | _step_clipping: 0.14 | _step_step: 347.38 | _step_zero_grad: 0.48 | _step_check_overflow: 0.47 samples/sec: 16.773 | iteration 560/ 143000 | elapsed time per iteration (ms): 61048.7 | learning rate: 2.350E-04 | approx flops per GPU: 72.4TFLOPS | lm_loss: 4.845221E+00 | loss scale: 4096.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 09:30:52,972] [INFO] [logging.py:60:log_dist] [Rank 0] step=570, skipped=0, lr=[0.00023916083916083913, 0.00023916083916083913], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 570 loss: 4.8072 iter time (s): 61.180 samples/sec: 16.738 %comms: 0.002941977568452387 %optimizer_step 0.05772932924318653 %forward: 23.76604711773583 %backward: 63.732064391076634 [2025-03-25 09:30:52,972] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17682.88 | forward: 145399.99 | backward_microstep: 389920.11 | backward: 389910.93 | backward_inner_microstep: 389894.62 | backward_inner: 389888.33 | backward_allreduce_microstep: 7.77 | backward_allreduce: 2.65 | reduce_tied_grads: 0.36 | comms: 18.00 | reduce_grads: 0.22 | step: 353.19 | _step_clipping: 0.14 | _step_step: 351.55 | _step_zero_grad: 0.47 | _step_check_overflow: 0.46 samples/sec: 16.737 | iteration 570/ 143000 | elapsed time per iteration (ms): 61180.2 | learning rate: 2.392E-04 | approx flops per GPU: 72.2TFLOPS | lm_loss: 4.840931E+00 | loss scale: 4096.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 09:41:04,867] [INFO] [logging.py:60:log_dist] [Rank 0] step=580, skipped=0, lr=[0.00024335664335664334, 0.00024335664335664334], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 580 loss: 4.8193 iter time (s): 61.189 samples/sec: 16.735 %comms: 0.0029411444866037323 %optimizer_step 0.056765923811373326 %forward: 23.762510262186456 %backward: 63.73267568028736 [2025-03-25 09:41:04,867] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17711.41 | forward: 145400.27 | backward_microstep: 389982.75 | backward: 389973.45 | backward_inner_microstep: 389955.40 | backward_inner: 389948.89 | backward_allreduce_microstep: 9.57 | backward_allreduce: 4.42 | reduce_tied_grads: 0.32 | comms: 18.00 | reduce_grads: 0.19 | step: 347.34 | _step_clipping: 0.12 | _step_step: 345.68 | _step_zero_grad: 0.49 | _step_check_overflow: 0.47 samples/sec: 16.735 | iteration 580/ 143000 | elapsed time per iteration (ms): 61189.5 | learning rate: 2.434E-04 | approx flops per GPU: 72.2TFLOPS | lm_loss: 4.830798E+00 | loss scale: 4096.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 09:51:16,140] [INFO] [logging.py:60:log_dist] [Rank 0] step=590, skipped=0, lr=[0.0002475524475524475, 0.0002475524475524475], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 590 loss: 4.7830 iter time (s): 61.127 samples/sec: 16.752 %comms: 0.002991017266252803 %optimizer_step 0.06062019915669258 %forward: 23.801534182679447 %backward: 63.789365093295494 [2025-03-25 09:51:16,141] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17041.26 | forward: 145491.14 | backward_microstep: 389933.83 | backward: 389923.92 | backward_inner_microstep: 389907.57 | backward_inner: 389901.25 | backward_allreduce_microstep: 7.77 | backward_allreduce: 2.67 | reduce_tied_grads: 0.35 | comms: 18.28 | reduce_grads: 0.20 | step: 370.55 | _step_clipping: 0.14 | _step_step: 368.67 | _step_zero_grad: 0.56 | _step_check_overflow: 0.55 samples/sec: 16.752 | iteration 590/ 143000 | elapsed time per iteration (ms): 61127.4 | learning rate: 2.476E-04 | approx flops per GPU: 72.3TFLOPS | lm_loss: 4.806562E+00 | loss scale: 4096.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 10:01:30,244] [INFO] [logging.py:60:log_dist] [Rank 0] step=600, skipped=0, lr=[0.00025174825174825174, 0.00025174825174825174], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 600 loss: 4.8026 iter time (s): 61.410 samples/sec: 16.675 %comms: 0.0030412172135504826 %optimizer_step 0.058531075554504486 %forward: 23.680526915471564 %backward: 63.473734153832694 [2025-03-25 10:01:30,244] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20092.96 | forward: 145421.55 | backward_microstep: 389799.36 | backward: 389790.69 | backward_inner_microstep: 389774.34 | backward_inner: 389768.01 | backward_allreduce_microstep: 7.84 | backward_allreduce: 2.69 | reduce_tied_grads: 0.33 | comms: 18.68 | reduce_grads: 0.19 | step: 359.44 | _step_clipping: 0.12 | _step_step: 357.82 | _step_zero_grad: 0.47 | _step_check_overflow: 0.48 samples/sec: 16.675 | iteration 600/ 143000 | elapsed time per iteration (ms): 61410.3 | learning rate: 2.517E-04 | approx flops per GPU: 71.9TFLOPS | lm_loss: 4.796307E+00 | loss scale: 4096.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 10:11:38,788] [INFO] [logging.py:60:log_dist] [Rank 0] step=610, skipped=0, lr=[0.00025594405594405595, 0.00025594405594405595], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 610 loss: 4.7614 iter time (s): 60.854 samples/sec: 16.827 %comms: 0.002963880358772871 %optimizer_step 0.05730491266197992 %forward: 23.85028655712094 %backward: 64.04123071558395 [2025-03-25 10:11:38,789] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14935.28 | forward: 145138.28 | backward_microstep: 389722.62 | backward: 389715.81 | backward_inner_microstep: 389700.22 | backward_inner: 389694.30 | backward_allreduce_microstep: 7.56 | backward_allreduce: 2.61 | reduce_tied_grads: 0.35 | comms: 18.04 | reduce_grads: 0.20 | step: 348.72 | _step_clipping: 0.13 | _step_step: 346.96 | _step_zero_grad: 0.50 | _step_check_overflow: 0.54 samples/sec: 16.827 | iteration 610/ 143000 | elapsed time per iteration (ms): 60854.5 | learning rate: 2.559E-04 | approx flops per GPU: 72.6TFLOPS | lm_loss: 4.783904E+00 | loss scale: 4096.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 10:21:47,570] [INFO] [logging.py:60:log_dist] [Rank 0] step=620, skipped=0, lr=[0.0002601398601398601, 0.0002601398601398601], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 620 loss: 4.7628 iter time (s): 60.878 samples/sec: 16.821 %comms: 0.0029432186236171888 %optimizer_step 0.05715582372215851 %forward: 23.84456588617181 %backward: 64.01765522150122 [2025-03-25 10:21:47,571] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15149.62 | forward: 145160.19 | backward_microstep: 389731.72 | backward: 389724.65 | backward_inner_microstep: 389708.90 | backward_inner: 389702.94 | backward_allreduce_microstep: 7.64 | backward_allreduce: 2.62 | reduce_tied_grads: 0.30 | comms: 17.92 | reduce_grads: 0.19 | step: 347.95 | _step_clipping: 0.12 | _step_step: 346.35 | _step_zero_grad: 0.48 | _step_check_overflow: 0.46 samples/sec: 16.820 | iteration 620/ 143000 | elapsed time per iteration (ms): 60878.2 | learning rate: 2.601E-04 | approx flops per GPU: 72.6TFLOPS | lm_loss: 4.773055E+00 | loss scale: 4096.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 10:31:56,413] [INFO] [logging.py:60:log_dist] [Rank 0] step=630, skipped=0, lr=[0.0002643356643356643, 0.0002643356643356643], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 630 loss: 4.7492 iter time (s): 60.883 samples/sec: 16.819 %comms: 0.0029372035134524153 %optimizer_step 0.057480765351456485 %forward: 23.859546169660018 %backward: 64.0158886431223 [2025-03-25 10:31:56,414] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15072.25 | forward: 145264.15 | backward_microstep: 389755.26 | backward: 389748.14 | backward_inner_microstep: 389732.45 | backward_inner: 389726.53 | backward_allreduce_microstep: 7.57 | backward_allreduce: 2.61 | reduce_tied_grads: 0.29 | comms: 17.88 | reduce_grads: 0.19 | step: 349.96 | _step_clipping: 0.11 | _step_step: 348.34 | _step_zero_grad: 0.46 | _step_check_overflow: 0.50 samples/sec: 16.819 | iteration 630/ 143000 | elapsed time per iteration (ms): 60884.3 | learning rate: 2.643E-04 | approx flops per GPU: 72.6TFLOPS | lm_loss: 4.756973E+00 | loss scale: 4096.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 10:42:04,100] [INFO] [logging.py:60:log_dist] [Rank 0] step=640, skipped=0, lr=[0.00026853146853146847, 0.00026853146853146847], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 640 loss: 4.7088 iter time (s): 60.768 samples/sec: 16.851 %comms: 0.002969437220907452 %optimizer_step 0.05721960554476184 %forward: 23.885956677254715 %backward: 64.1257573004081 [2025-03-25 10:42:04,100] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14096.00 | forward: 145150.46 | backward_microstep: 389686.93 | backward: 389680.14 | backward_inner_microstep: 389662.74 | backward_inner: 389656.87 | backward_allreduce_microstep: 7.61 | backward_allreduce: 2.63 | reduce_tied_grads: 0.30 | comms: 18.04 | reduce_grads: 0.20 | step: 347.71 | _step_clipping: 0.12 | _step_step: 346.09 | _step_zero_grad: 0.49 | _step_check_overflow: 0.46 samples/sec: 16.851 | iteration 640/ 143000 | elapsed time per iteration (ms): 60768.7 | learning rate: 2.685E-04 | approx flops per GPU: 72.7TFLOPS | lm_loss: 4.743563E+00 | loss scale: 4096.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 10:52:12,949] [INFO] [logging.py:60:log_dist] [Rank 0] step=650, skipped=0, lr=[0.0002727272727272727, 0.0002727272727272727], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 650 loss: 4.7017 iter time (s): 60.884 samples/sec: 16.819 %comms: 0.002941835001663463 %optimizer_step 0.057634825825335086 %forward: 23.862986684896477 %backward: 64.01648155178893 [2025-03-25 10:52:12,950] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15040.15 | forward: 145288.44 | backward_microstep: 389767.30 | backward: 389760.72 | backward_inner_microstep: 389745.14 | backward_inner: 389739.29 | backward_allreduce_microstep: 7.58 | backward_allreduce: 2.61 | reduce_tied_grads: 0.30 | comms: 17.91 | reduce_grads: 0.20 | step: 350.91 | _step_clipping: 0.13 | _step_step: 349.32 | _step_zero_grad: 0.46 | _step_check_overflow: 0.45 samples/sec: 16.819 | iteration 650/ 143000 | elapsed time per iteration (ms): 60884.9 | learning rate: 2.727E-04 | approx flops per GPU: 72.6TFLOPS | lm_loss: 4.722232E+00 | loss scale: 4096.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 11:02:20,886] [INFO] [logging.py:60:log_dist] [Rank 0] step=660, skipped=0, lr=[0.0002769230769230769, 0.0002769230769230769], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 660 loss: 4.6922 iter time (s): 60.793 samples/sec: 16.844 %comms: 0.0029604482486434735 %optimizer_step 0.057762997760932515 %forward: 23.871824330346218 %backward: 64.10085151236457 [2025-03-25 11:02:20,887] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14365.67 | forward: 145124.39 | backward_microstep: 389695.66 | backward: 389689.39 | backward_inner_microstep: 389673.80 | backward_inner: 389668.03 | backward_allreduce_microstep: 7.61 | backward_allreduce: 2.62 | reduce_tied_grads: 0.28 | comms: 18.00 | reduce_grads: 0.19 | step: 351.16 | _step_clipping: 0.12 | _step_step: 349.60 | _step_zero_grad: 0.46 | _step_check_overflow: 0.44 samples/sec: 16.844 | iteration 660/ 143000 | elapsed time per iteration (ms): 60793.7 | learning rate: 2.769E-04 | approx flops per GPU: 72.7TFLOPS | lm_loss: 4.713138E+00 | loss scale: 4096.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 11:12:32,628] [INFO] [logging.py:60:log_dist] [Rank 0] step=670, skipped=0, lr=[0.0002811188811188811, 0.0002811188811188811], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 670 loss: 4.7030 iter time (s): 61.174 samples/sec: 16.739 %comms: 0.0029416082839988896 %optimizer_step 0.0578865763511 %forward: 23.745371201974567 %backward: 63.71336362288006 [2025-03-25 11:12:32,628] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17970.91 | forward: 145259.01 | backward_microstep: 389764.50 | backward: 389757.66 | backward_inner_microstep: 389742.13 | backward_inner: 389736.28 | backward_allreduce_microstep: 7.54 | backward_allreduce: 2.60 | reduce_tied_grads: 0.35 | comms: 17.99 | reduce_grads: 0.20 | step: 354.11 | _step_clipping: 0.13 | _step_step: 352.41 | _step_zero_grad: 0.47 | _step_check_overflow: 0.53 samples/sec: 16.739 | iteration 670/ 143000 | elapsed time per iteration (ms): 61174.2 | learning rate: 2.811E-04 | approx flops per GPU: 72.2TFLOPS | lm_loss: 4.704435E+00 | loss scale: 4096.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 11:22:45,340] [INFO] [logging.py:60:log_dist] [Rank 0] step=680, skipped=0, lr=[0.0002853146853146853, 0.0002853146853146853], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 680 loss: 4.6807 iter time (s): 61.271 samples/sec: 16.713 %comms: 0.0029702175229391974 %optimizer_step 0.05777917674370678 %forward: 23.754280143920326 %backward: 63.63206533727387 [2025-03-25 11:22:45,341] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18507.77 | forward: 145544.12 | backward_microstep: 389886.54 | backward: 389878.07 | backward_inner_microstep: 389862.22 | backward_inner: 389856.05 | backward_allreduce_microstep: 7.57 | backward_allreduce: 2.62 | reduce_tied_grads: 0.35 | comms: 18.20 | reduce_grads: 0.18 | step: 354.02 | _step_clipping: 0.14 | _step_step: 352.24 | _step_zero_grad: 0.52 | _step_check_overflow: 0.54 samples/sec: 16.713 | iteration 680/ 143000 | elapsed time per iteration (ms): 61271.3 | learning rate: 2.853E-04 | approx flops per GPU: 72.1TFLOPS | lm_loss: 4.684703E+00 | loss scale: 4096.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 11:32:59,811] [INFO] [logging.py:60:log_dist] [Rank 0] step=690, skipped=0, lr=[0.0002895104895104895, 0.0002895104895104895], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 690 loss: 4.6655 iter time (s): 61.446 samples/sec: 16.665 %comms: 0.0029792598990716623 %optimizer_step 0.05928732243287207 %forward: 23.659425178476244 %backward: 63.45118775855395 [2025-03-25 11:32:59,811] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20417.54 | forward: 145378.76 | backward_microstep: 389893.43 | backward: 389885.01 | backward_inner_microstep: 389868.98 | backward_inner: 389862.82 | backward_allreduce_microstep: 7.64 | backward_allreduce: 2.65 | reduce_tied_grads: 0.38 | comms: 18.31 | reduce_grads: 0.51 | step: 364.30 | _step_clipping: 0.16 | _step_step: 362.57 | _step_zero_grad: 0.49 | _step_check_overflow: 0.53 samples/sec: 16.665 | iteration 690/ 143000 | elapsed time per iteration (ms): 61447.0 | learning rate: 2.895E-04 | approx flops per GPU: 71.9TFLOPS | lm_loss: 4.677845E+00 | loss scale: 4096.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 11:43:10,334] [INFO] [logging.py:60:log_dist] [Rank 0] step=700, skipped=0, lr=[0.0002937062937062937, 0.0002937062937062937], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 700 loss: 4.6553 iter time (s): 61.052 samples/sec: 16.773 %comms: 0.002949157161995716 %optimizer_step 0.05751936249727888 %forward: 23.772316111722162 %backward: 63.83109227231149 [2025-03-25 11:43:10,335] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16951.85 | forward: 145134.25 | backward_microstep: 389707.00 | backward: 389700.25 | backward_inner_microstep: 389682.70 | backward_inner: 389675.13 | backward_allreduce_microstep: 9.51 | backward_allreduce: 4.52 | reduce_tied_grads: 0.35 | comms: 18.01 | reduce_grads: 0.19 | step: 351.17 | _step_clipping: 0.13 | _step_step: 349.47 | _step_zero_grad: 0.48 | _step_check_overflow: 0.52 samples/sec: 16.772 | iteration 700/ 143000 | elapsed time per iteration (ms): 61052.4 | learning rate: 2.937E-04 | approx flops per GPU: 72.4TFLOPS | lm_loss: 4.659664E+00 | loss scale: 4096.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 11:53:23,656] [INFO] [logging.py:60:log_dist] [Rank 0] step=710, skipped=0, lr=[0.0002979020979020979, 0.0002979020979020979], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 710 loss: 4.6924 iter time (s): 61.332 samples/sec: 16.696 %comms: 0.002916337685393716 %optimizer_step 0.05682409400953794 %forward: 23.660749500005778 %backward: 63.54206370317254 [2025-03-25 11:53:23,657] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19745.18 | forward: 145115.44 | backward_microstep: 389721.40 | backward: 389714.40 | backward_inner_microstep: 389699.07 | backward_inner: 389693.26 | backward_allreduce_microstep: 7.40 | backward_allreduce: 2.54 | reduce_tied_grads: 0.30 | comms: 17.89 | reduce_grads: 0.18 | step: 348.51 | _step_clipping: 0.13 | _step_step: 346.89 | _step_zero_grad: 0.46 | _step_check_overflow: 0.49 samples/sec: 16.696 | iteration 710/ 143000 | elapsed time per iteration (ms): 61332.2 | learning rate: 2.979E-04 | approx flops per GPU: 72.0TFLOPS | lm_loss: 4.639323E+00 | loss scale: 4096.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 12:03:37,168] [INFO] [logging.py:60:log_dist] [Rank 0] step=720, skipped=0, lr=[0.00030209790209790205, 0.00030209790209790205], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 720 loss: 4.6068 iter time (s): 61.351 samples/sec: 16.691 %comms: 0.002981390652932333 %optimizer_step 0.05719934644192384 %forward: 23.678905708086575 %backward: 63.55621790355733 [2025-03-25 12:03:37,169] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19525.96 | forward: 145271.40 | backward_microstep: 389929.50 | backward: 389920.91 | backward_inner_microstep: 389905.03 | backward_inner: 389898.77 | backward_allreduce_microstep: 7.61 | backward_allreduce: 2.62 | reduce_tied_grads: 0.41 | comms: 18.29 | reduce_grads: 0.19 | step: 350.92 | _step_clipping: 0.14 | _step_step: 349.11 | _step_zero_grad: 0.50 | _step_check_overflow: 0.60 samples/sec: 16.691 | iteration 720/ 143000 | elapsed time per iteration (ms): 61351.2 | learning rate: 3.021E-04 | approx flops per GPU: 72.0TFLOPS | lm_loss: 4.620327E+00 | loss scale: 4096.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 12:13:50,329] [INFO] [logging.py:60:log_dist] [Rank 0] step=730, skipped=0, lr=[0.00030629370629370626, 0.00030629370629370626], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 730 loss: 4.6006 iter time (s): 61.316 samples/sec: 16.700 %comms: 0.0029365105366358883 %optimizer_step 0.056996300803564326 %forward: 23.68571737792698 %backward: 63.58739987889655 [2025-03-25 12:13:50,330] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19260.93 | forward: 145230.24 | backward_microstep: 389896.38 | backward: 389889.54 | backward_inner_microstep: 389873.65 | backward_inner: 389867.64 | backward_allreduce_microstep: 7.80 | backward_allreduce: 2.80 | reduce_tied_grads: 0.52 | comms: 18.01 | reduce_grads: 0.19 | step: 349.48 | _step_clipping: 0.13 | _step_step: 347.77 | _step_zero_grad: 0.48 | _step_check_overflow: 0.54 samples/sec: 16.700 | iteration 730/ 143000 | elapsed time per iteration (ms): 61316.1 | learning rate: 3.063E-04 | approx flops per GPU: 72.0TFLOPS | lm_loss: 4.614421E+00 | loss scale: 4096.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 12:24:00,997] [INFO] [logging.py:60:log_dist] [Rank 0] step=740, skipped=0, lr=[0.0003104895104895105, 0.0003104895104895105], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 740 loss: 4.6134 iter time (s): 61.066 samples/sec: 16.769 %comms: 0.0029372540152249114 %optimizer_step 0.05738225878366347 %forward: 23.753977624530517 %backward: 63.840034279041205 [2025-03-25 12:24:00,998] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17007.45 | forward: 145056.62 | backward_microstep: 389853.48 | backward: 389847.10 | backward_inner_microstep: 389831.72 | backward_inner: 389825.88 | backward_allreduce_microstep: 7.46 | backward_allreduce: 2.57 | reduce_tied_grads: 0.34 | comms: 17.94 | reduce_grads: 0.18 | step: 350.41 | _step_clipping: 0.14 | _step_step: 348.76 | _step_zero_grad: 0.47 | _step_check_overflow: 0.49 samples/sec: 16.769 | iteration 740/ 143000 | elapsed time per iteration (ms): 61066.8 | learning rate: 3.105E-04 | approx flops per GPU: 72.3TFLOPS | lm_loss: 4.598652E+00 | loss scale: 4096.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 12:34:11,575] [INFO] [logging.py:60:log_dist] [Rank 0] step=750, skipped=0, lr=[0.00031468531468531463, 0.00031468531468531463], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 750 loss: 4.5704 iter time (s): 61.057 samples/sec: 16.771 %comms: 0.002954049611279368 %optimizer_step 0.05952391809227441 %forward: 23.834610452063547 %backward: 63.87633960576691 [2025-03-25 12:34:11,576] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16247.96 | forward: 145527.49 | backward_microstep: 390018.85 | backward: 390011.13 | backward_inner_microstep: 389994.81 | backward_inner: 389988.65 | backward_allreduce_microstep: 8.01 | backward_allreduce: 2.68 | reduce_tied_grads: 0.39 | comms: 18.04 | reduce_grads: 0.20 | step: 363.44 | _step_clipping: 0.15 | _step_step: 361.74 | _step_zero_grad: 0.47 | _step_check_overflow: 0.51 samples/sec: 16.771 | iteration 750/ 143000 | elapsed time per iteration (ms): 61057.8 | learning rate: 3.147E-04 | approx flops per GPU: 72.3TFLOPS | lm_loss: 4.584679E+00 | loss scale: 4096.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 12:44:22,350] [INFO] [logging.py:60:log_dist] [Rank 0] step=760, skipped=0, lr=[0.00031888111888111884, 0.00031888111888111884], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 760 loss: 4.5901 iter time (s): 61.077 samples/sec: 16.766 %comms: 0.0029382219030333785 %optimizer_step 0.05904061608596265 %forward: 23.8094344269201 %backward: 63.83011636283493 [2025-03-25 12:44:22,351] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16705.11 | forward: 145420.79 | backward_microstep: 389861.83 | backward: 389854.95 | backward_inner_microstep: 389839.09 | backward_inner: 389833.22 | backward_allreduce_microstep: 7.71 | backward_allreduce: 2.65 | reduce_tied_grads: 0.32 | comms: 17.95 | reduce_grads: 0.19 | step: 360.60 | _step_clipping: 0.14 | _step_step: 358.78 | _step_zero_grad: 0.51 | _step_check_overflow: 0.61 samples/sec: 16.766 | iteration 760/ 143000 | elapsed time per iteration (ms): 61077.5 | learning rate: 3.189E-04 | approx flops per GPU: 72.3TFLOPS | lm_loss: 4.573838E+00 | loss scale: 4096.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 12:54:33,323] [INFO] [logging.py:60:log_dist] [Rank 0] step=770, skipped=0, lr=[0.00032307692307692305, 0.00032307692307692305], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 770 loss: 4.5532 iter time (s): 61.097 samples/sec: 16.760 %comms: 0.0029393778740936494 %optimizer_step 0.05870334557793407 %forward: 23.787894029010676 %backward: 63.81698985838694 [2025-03-25 12:54:33,324] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16920.15 | forward: 145336.28 | backward_microstep: 389910.88 | backward: 389901.01 | backward_inner_microstep: 389884.98 | backward_inner: 389878.89 | backward_allreduce_microstep: 7.72 | backward_allreduce: 2.65 | reduce_tied_grads: 0.31 | comms: 17.96 | reduce_grads: 0.21 | step: 358.66 | _step_clipping: 0.13 | _step_step: 357.04 | _step_zero_grad: 0.45 | _step_check_overflow: 0.45 samples/sec: 16.760 | iteration 770/ 143000 | elapsed time per iteration (ms): 61097.3 | learning rate: 3.231E-04 | approx flops per GPU: 72.3TFLOPS | lm_loss: 4.563704E+00 | loss scale: 4096.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 13:04:52,130] [INFO] [logging.py:60:log_dist] [Rank 0] step=780, skipped=0, lr=[0.00032727272727272726, 0.00032727272727272726], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 780 loss: 4.5229 iter time (s): 61.880 samples/sec: 16.548 %comms: 0.002946318929081258 %optimizer_step 0.057904702904334096 %forward: 23.47277591060793 %backward: 63.03208942093285 [2025-03-25 13:04:52,131] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24682.77 | forward: 145249.91 | backward_microstep: 390055.61 | backward: 390043.58 | backward_inner_microstep: 390025.57 | backward_inner: 390019.30 | backward_allreduce_microstep: 7.58 | backward_allreduce: 2.60 | reduce_tied_grads: 0.38 | comms: 18.23 | reduce_grads: 0.23 | step: 358.32 | _step_clipping: 0.16 | _step_step: 356.48 | _step_zero_grad: 0.49 | _step_check_overflow: 0.62 samples/sec: 16.548 | iteration 780/ 143000 | elapsed time per iteration (ms): 61880.7 | learning rate: 3.273E-04 | approx flops per GPU: 71.4TFLOPS | lm_loss: 4.550867E+00 | loss scale: 4096.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 13:14:59,537] [INFO] [logging.py:60:log_dist] [Rank 0] step=790, skipped=0, lr=[0.0003314685314685315, 0.0003314685314685315], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 790 loss: 4.5287 iter time (s): 60.740 samples/sec: 16.859 %comms: 0.002977010665429704 %optimizer_step 0.05776608876109343 %forward: 23.892070711190787 %backward: 64.18690817475546 [2025-03-25 13:14:59,538] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13631.79 | forward: 145120.58 | backward_microstep: 389881.27 | backward: 389871.66 | backward_inner_microstep: 389855.92 | backward_inner: 389849.68 | backward_allreduce_microstep: 7.52 | backward_allreduce: 2.57 | reduce_tied_grads: 0.38 | comms: 18.08 | reduce_grads: 0.18 | step: 350.87 | _step_clipping: 0.14 | _step_step: 349.17 | _step_zero_grad: 0.47 | _step_check_overflow: 0.54 samples/sec: 16.859 | iteration 790/ 143000 | elapsed time per iteration (ms): 60740.7 | learning rate: 3.315E-04 | approx flops per GPU: 72.7TFLOPS | lm_loss: 4.531150E+00 | loss scale: 4096.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 13:25:06,344] [INFO] [logging.py:60:log_dist] [Rank 0] step=800, skipped=0, lr=[0.00033566433566433563, 0.00033566433566433563], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 800 loss: 4.5449 iter time (s): 60.680 samples/sec: 16.875 %comms: 0.003004284505288444 %optimizer_step 0.05902270570821097 %forward: 23.907211097869556 %backward: 64.23595984322657 [2025-03-25 13:25:06,344] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13162.35 | forward: 145068.66 | backward_microstep: 389790.21 | backward: 389783.01 | backward_inner_microstep: 389767.67 | backward_inner: 389761.60 | backward_allreduce_microstep: 7.34 | backward_allreduce: 2.52 | reduce_tied_grads: 0.35 | comms: 18.23 | reduce_grads: 0.19 | step: 358.15 | _step_clipping: 0.13 | _step_step: 356.45 | _step_zero_grad: 0.49 | _step_check_overflow: 0.52 samples/sec: 16.875 | iteration 800/ 143000 | elapsed time per iteration (ms): 60680.7 | learning rate: 3.357E-04 | approx flops per GPU: 72.8TFLOPS | lm_loss: 4.522162E+00 | loss scale: 4096.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 13:35:12,946] [INFO] [logging.py:60:log_dist] [Rank 0] step=810, skipped=0, lr=[0.0003398601398601398, 0.0003398601398601398], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 810 loss: 4.4975 iter time (s): 60.660 samples/sec: 16.881 %comms: 0.0029613454610863954 %optimizer_step 0.05788128617004823 %forward: 23.915336526213004 %backward: 64.25743001090778 [2025-03-25 13:35:12,946] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 12980.33 | forward: 145069.53 | backward_microstep: 389790.49 | backward: 389783.14 | backward_inner_microstep: 389767.84 | backward_inner: 389761.84 | backward_allreduce_microstep: 7.26 | backward_allreduce: 2.48 | reduce_tied_grads: 0.33 | comms: 17.96 | reduce_grads: 0.18 | step: 351.11 | _step_clipping: 0.13 | _step_step: 349.41 | _step_zero_grad: 0.49 | _step_check_overflow: 0.52 samples/sec: 16.881 | iteration 810/ 143000 | elapsed time per iteration (ms): 60660.2 | learning rate: 3.399E-04 | approx flops per GPU: 72.8TFLOPS | lm_loss: 4.512955E+00 | loss scale: 4096.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 13:45:25,390] [INFO] [logging.py:60:log_dist] [Rank 0] step=820, skipped=0, lr=[0.000344055944055944, 0.000344055944055944], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 820 loss: 4.4876 iter time (s): 61.244 samples/sec: 16.720 %comms: 0.0029276436731857695 %optimizer_step 0.056263320948542334 %forward: 23.697340200180403 %backward: 63.652407409929666 [2025-03-25 13:45:25,391] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18705.13 | forward: 145131.75 | backward_microstep: 389839.99 | backward: 389832.15 | backward_inner_microstep: 389816.51 | backward_inner: 389810.51 | backward_allreduce_microstep: 7.49 | backward_allreduce: 2.60 | reduce_tied_grads: 0.33 | comms: 17.93 | reduce_grads: 0.19 | step: 344.58 | _step_clipping: 0.12 | _step_step: 342.94 | _step_zero_grad: 0.47 | _step_check_overflow: 0.51 samples/sec: 16.720 | iteration 820/ 143000 | elapsed time per iteration (ms): 61244.4 | learning rate: 3.441E-04 | approx flops per GPU: 72.1TFLOPS | lm_loss: 4.496007E+00 | loss scale: 4096.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 13:55:34,043] [INFO] [logging.py:60:log_dist] [Rank 0] step=830, skipped=0, lr=[0.0003482517482517482, 0.0003482517482517482], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 830 loss: 4.4591 iter time (s): 60.865 samples/sec: 16.824 %comms: 0.0029368292494041243 %optimizer_step 0.056536695285835874 %forward: 23.850921879585556 %backward: 64.0462650185478 [2025-03-25 13:55:34,044] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14905.28 | forward: 145168.19 | backward_microstep: 389823.10 | backward: 389816.39 | backward_inner_microstep: 389801.04 | backward_inner: 389795.16 | backward_allreduce_microstep: 7.40 | backward_allreduce: 2.55 | reduce_tied_grads: 0.29 | comms: 17.87 | reduce_grads: 0.19 | step: 344.11 | _step_clipping: 0.13 | _step_step: 342.51 | _step_zero_grad: 0.47 | _step_check_overflow: 0.47 samples/sec: 16.824 | iteration 830/ 143000 | elapsed time per iteration (ms): 60865.3 | learning rate: 3.483E-04 | approx flops per GPU: 72.6TFLOPS | lm_loss: 4.483524E+00 | loss scale: 4096.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 14:05:42,973] [INFO] [logging.py:60:log_dist] [Rank 0] step=840, skipped=0, lr=[0.0003524475524475525, 0.0003524475524475525], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 840 loss: 4.4671 iter time (s): 60.892 samples/sec: 16.817 %comms: 0.002940470771422178 %optimizer_step 0.05634772064361193 %forward: 23.837543533317383 %backward: 64.02970981346733 [2025-03-25 14:05:42,973] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15096.58 | forward: 145152.55 | backward_microstep: 389899.81 | backward: 389892.34 | backward_inner_microstep: 389876.74 | backward_inner: 389870.71 | backward_allreduce_microstep: 7.47 | backward_allreduce: 2.56 | reduce_tied_grads: 0.28 | comms: 17.91 | reduce_grads: 0.19 | step: 343.11 | _step_clipping: 0.11 | _step_step: 341.56 | _step_zero_grad: 0.46 | _step_check_overflow: 0.45 samples/sec: 16.816 | iteration 840/ 143000 | elapsed time per iteration (ms): 60892.9 | learning rate: 3.524E-04 | approx flops per GPU: 72.5TFLOPS | lm_loss: 4.468191E+00 | loss scale: 4096.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 14:15:53,002] [INFO] [logging.py:60:log_dist] [Rank 0] step=850, skipped=0, lr=[0.00035664335664335663, 0.00035664335664335663], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 850 loss: 4.4269 iter time (s): 61.002 samples/sec: 16.786 %comms: 0.0030139209096516776 %optimizer_step 0.057762459629357006 %forward: 23.79709304379417 %backward: 63.92376551908093 [2025-03-25 14:15:53,003] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16120.63 | forward: 145168.04 | backward_microstep: 389958.32 | backward: 389950.48 | backward_inner_microstep: 389934.64 | backward_inner: 389928.54 | backward_allreduce_microstep: 7.70 | backward_allreduce: 2.71 | reduce_tied_grads: 0.32 | comms: 18.39 | reduce_grads: 0.20 | step: 352.37 | _step_clipping: 0.12 | _step_step: 350.57 | _step_zero_grad: 0.49 | _step_check_overflow: 0.57 samples/sec: 16.786 | iteration 850/ 143000 | elapsed time per iteration (ms): 61003.0 | learning rate: 3.566E-04 | approx flops per GPU: 72.4TFLOPS | lm_loss: 4.456855E+00 | loss scale: 4096.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 14:26:03,326] [INFO] [logging.py:60:log_dist] [Rank 0] step=860, skipped=0, lr=[0.0003608391608391608, 0.0003608391608391608], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 860 loss: 4.4399 iter time (s): 61.032 samples/sec: 16.778 %comms: 0.002973522076603469 %optimizer_step 0.05682517960430979 %forward: 23.796415540197966 %backward: 63.896492109817295 [2025-03-25 14:26:03,327] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16330.13 | forward: 145233.85 | backward_microstep: 389980.44 | backward: 389971.90 | backward_inner_microstep: 389955.95 | backward_inner: 389949.69 | backward_allreduce_microstep: 7.67 | backward_allreduce: 2.63 | reduce_tied_grads: 0.31 | comms: 18.15 | reduce_grads: 0.19 | step: 346.81 | _step_clipping: 0.12 | _step_step: 345.15 | _step_zero_grad: 0.47 | _step_check_overflow: 0.50 samples/sec: 16.778 | iteration 860/ 143000 | elapsed time per iteration (ms): 61032.4 | learning rate: 3.608E-04 | approx flops per GPU: 72.4TFLOPS | lm_loss: 4.441640E+00 | loss scale: 4096.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 14:36:13,378] [INFO] [logging.py:60:log_dist] [Rank 0] step=870, skipped=0, lr=[0.00036503496503496495, 0.00036503496503496495], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 870 loss: 4.4158 iter time (s): 61.005 samples/sec: 16.786 %comms: 0.003013105717821275 %optimizer_step 0.057367935194752064 %forward: 23.81385075005707 %backward: 63.936484203579 [2025-03-25 14:36:13,379] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15921.04 | forward: 145275.65 | backward_microstep: 390051.14 | backward: 390042.52 | backward_inner_microstep: 390026.48 | backward_inner: 390020.18 | backward_allreduce_microstep: 7.65 | backward_allreduce: 2.63 | reduce_tied_grads: 0.32 | comms: 18.38 | reduce_grads: 0.19 | step: 349.97 | _step_clipping: 0.12 | _step_step: 348.27 | _step_zero_grad: 0.49 | _step_check_overflow: 0.52 samples/sec: 16.785 | iteration 870/ 143000 | elapsed time per iteration (ms): 61005.2 | learning rate: 3.650E-04 | approx flops per GPU: 72.4TFLOPS | lm_loss: 4.429484E+00 | loss scale: 4096.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 14:46:23,927] [INFO] [logging.py:60:log_dist] [Rank 0] step=880, skipped=0, lr=[0.00036923076923076916, 0.00036923076923076916], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 880 loss: 4.4039 iter time (s): 61.054 samples/sec: 16.772 %comms: 0.002998746176004574 %optimizer_step 0.05868599218568967 %forward: 23.794279273816343 %backward: 63.898498800883644 [2025-03-25 14:46:23,928] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16326.74 | forward: 145274.34 | backward_microstep: 390136.64 | backward: 390127.91 | backward_inner_microstep: 390110.22 | backward_inner: 390103.92 | backward_allreduce_microstep: 7.57 | backward_allreduce: 2.62 | reduce_tied_grads: 0.30 | comms: 18.31 | reduce_grads: 0.19 | step: 358.30 | _step_clipping: 0.12 | _step_step: 356.57 | _step_zero_grad: 0.51 | _step_check_overflow: 0.51 samples/sec: 16.772 | iteration 880/ 143000 | elapsed time per iteration (ms): 61054.9 | learning rate: 3.692E-04 | approx flops per GPU: 72.4TFLOPS | lm_loss: 4.415333E+00 | loss scale: 4096.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 14:56:39,311] [INFO] [logging.py:60:log_dist] [Rank 0] step=890, skipped=0, lr=[0.00037342657342657337, 0.00037342657342657337], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 890 loss: 4.4120 iter time (s): 61.538 samples/sec: 16.640 %comms: 0.0029415961042656524 %optimizer_step 0.05710892357429689 %forward: 23.613310026035855 %backward: 63.37225581623124 [2025-03-25 14:56:39,311] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21298.90 | forward: 145311.08 | backward_microstep: 389987.00 | backward: 389978.82 | backward_inner_microstep: 389962.76 | backward_inner: 389956.40 | backward_allreduce_microstep: 7.70 | backward_allreduce: 2.63 | reduce_tied_grads: 0.38 | comms: 18.10 | reduce_grads: 0.20 | step: 351.44 | _step_clipping: 0.13 | _step_step: 349.70 | _step_zero_grad: 0.49 | _step_check_overflow: 0.54 samples/sec: 16.640 | iteration 890/ 143000 | elapsed time per iteration (ms): 61538.4 | learning rate: 3.734E-04 | approx flops per GPU: 71.8TFLOPS | lm_loss: 4.419804E+00 | loss scale: 4096.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 15:06:49,794] [INFO] [logging.py:60:log_dist] [Rank 0] step=900, skipped=0, lr=[0.0003776223776223776, 0.0003776223776223776], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 900 loss: 4.3837 iter time (s): 61.048 samples/sec: 16.774 %comms: 0.002936698698576401 %optimizer_step 0.056758834667294984 %forward: 23.807242795836874 %backward: 63.89948275427292 [2025-03-25 15:06:49,794] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16231.60 | forward: 145337.86 | backward_microstep: 390100.30 | backward: 390091.97 | backward_inner_microstep: 390074.28 | backward_inner: 390067.98 | backward_allreduce_microstep: 7.61 | backward_allreduce: 2.63 | reduce_tied_grads: 0.34 | comms: 17.93 | reduce_grads: 0.18 | step: 346.50 | _step_clipping: 0.12 | _step_step: 344.85 | _step_zero_grad: 0.47 | _step_check_overflow: 0.51 samples/sec: 16.774 | iteration 900/ 143000 | elapsed time per iteration (ms): 61048.3 | learning rate: 3.776E-04 | approx flops per GPU: 72.4TFLOPS | lm_loss: 4.398434E+00 | loss scale: 4096.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 15:17:02,072] [INFO] [logging.py:60:log_dist] [Rank 0] step=910, skipped=0, lr=[0.0003818181818181818, 0.0003818181818181818], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 910 loss: 4.3705 iter time (s): 61.227 samples/sec: 16.725 %comms: 0.003014341915632955 %optimizer_step 0.058104175955663126 %forward: 23.747676289338045 %backward: 63.7119157583107 [2025-03-25 15:17:02,072] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17964.69 | forward: 145400.46 | backward_microstep: 390099.87 | backward: 390090.45 | backward_inner_microstep: 390074.61 | backward_inner: 390068.32 | backward_allreduce_microstep: 7.49 | backward_allreduce: 2.60 | reduce_tied_grads: 0.55 | comms: 18.46 | reduce_grads: 0.21 | step: 355.76 | _step_clipping: 0.13 | _step_step: 353.97 | _step_zero_grad: 0.49 | _step_check_overflow: 0.55 samples/sec: 16.724 | iteration 910/ 143000 | elapsed time per iteration (ms): 61227.8 | learning rate: 3.818E-04 | approx flops per GPU: 72.1TFLOPS | lm_loss: 4.387000E+00 | loss scale: 4096.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 15:27:15,287] [INFO] [logging.py:60:log_dist] [Rank 0] step=920, skipped=0, lr=[0.00038601398601398595, 0.00038601398601398595], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 920 loss: 4.3336 iter time (s): 61.321 samples/sec: 16.699 %comms: 0.0029484607257202755 %optimizer_step 0.05981152077758566 %forward: 23.696515709667526 %backward: 63.610380617379846 [2025-03-25 15:27:15,287] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19020.75 | forward: 145309.23 | backward_microstep: 390072.86 | backward: 390064.75 | backward_inner_microstep: 390048.79 | backward_inner: 390042.57 | backward_allreduce_microstep: 7.67 | backward_allreduce: 2.65 | reduce_tied_grads: 0.41 | comms: 18.08 | reduce_grads: 0.20 | step: 366.77 | _step_clipping: 0.15 | _step_step: 364.99 | _step_zero_grad: 0.53 | _step_check_overflow: 0.53 samples/sec: 16.699 | iteration 920/ 143000 | elapsed time per iteration (ms): 61321.5 | learning rate: 3.860E-04 | approx flops per GPU: 72.0TFLOPS | lm_loss: 4.362415E+00 | loss scale: 4096.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 15:37:24,969] [INFO] [logging.py:60:log_dist] [Rank 0] step=930, skipped=0, lr=[0.00039020979020979016, 0.00039020979020979016], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 930 loss: 4.3597 iter time (s): 60.968 samples/sec: 16.796 %comms: 0.0029522125665096374 %optimizer_step 0.05693656716481093 %forward: 23.82303997154213 %backward: 63.97786025076183 [2025-03-25 15:37:24,969] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15569.73 | forward: 145243.38 | backward_microstep: 390065.95 | backward: 390057.71 | backward_inner_microstep: 390041.77 | backward_inner: 390035.58 | backward_allreduce_microstep: 7.68 | backward_allreduce: 2.68 | reduce_tied_grads: 0.33 | comms: 18.00 | reduce_grads: 0.19 | step: 347.13 | _step_clipping: 0.13 | _step_step: 345.45 | _step_zero_grad: 0.49 | _step_check_overflow: 0.49 samples/sec: 16.796 | iteration 930/ 143000 | elapsed time per iteration (ms): 60968.2 | learning rate: 3.902E-04 | approx flops per GPU: 72.5TFLOPS | lm_loss: 4.353299E+00 | loss scale: 4096.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 15:47:34,767] [INFO] [logging.py:60:log_dist] [Rank 0] step=940, skipped=0, lr=[0.00039440559440559437, 0.00039440559440559437], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 940 loss: 4.3288 iter time (s): 60.979 samples/sec: 16.793 %comms: 0.0029446889278687965 %optimizer_step 0.05677633159107984 %forward: 23.824586556711242 %backward: 63.971159819469 [2025-03-25 15:47:34,767] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15632.10 | forward: 145280.57 | backward_microstep: 390099.30 | backward: 390091.40 | backward_inner_microstep: 390075.43 | backward_inner: 390069.07 | backward_allreduce_microstep: 7.66 | backward_allreduce: 2.63 | reduce_tied_grads: 0.32 | comms: 17.96 | reduce_grads: 0.19 | step: 346.22 | _step_clipping: 0.12 | _step_step: 344.61 | _step_zero_grad: 0.49 | _step_check_overflow: 0.45 samples/sec: 16.792 | iteration 940/ 143000 | elapsed time per iteration (ms): 60979.8 | learning rate: 3.944E-04 | approx flops per GPU: 72.4TFLOPS | lm_loss: 4.354015E+00 | loss scale: 4096.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 15:57:45,338] [INFO] [logging.py:60:log_dist] [Rank 0] step=950, skipped=0, lr=[0.0003986013986013986, 0.0003986013986013986], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 950 loss: 4.3557 iter time (s): 61.057 samples/sec: 16.771 %comms: 0.002952246401144805 %optimizer_step 0.05736856251311955 %forward: 23.81250462106085 %backward: 63.89805291512748 [2025-03-25 15:57:45,338] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16218.43 | forward: 145390.94 | backward_microstep: 390147.82 | backward: 390139.47 | backward_inner_microstep: 390123.37 | backward_inner: 390117.05 | backward_allreduce_microstep: 7.74 | backward_allreduce: 2.64 | reduce_tied_grads: 0.32 | comms: 18.03 | reduce_grads: 0.20 | step: 350.27 | _step_clipping: 0.13 | _step_step: 348.61 | _step_zero_grad: 0.50 | _step_check_overflow: 0.45 samples/sec: 16.771 | iteration 950/ 143000 | elapsed time per iteration (ms): 61057.1 | learning rate: 3.986E-04 | approx flops per GPU: 72.3TFLOPS | lm_loss: 4.350081E+00 | loss scale: 4096.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 16:07:55,429] [INFO] [logging.py:60:log_dist] [Rank 0] step=960, skipped=0, lr=[0.0004027972027972028, 0.0004027972027972028], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 960 loss: 4.3143 iter time (s): 61.009 samples/sec: 16.785 %comms: 0.00294827482985805 %optimizer_step 0.05725381443250978 %forward: 23.82519804487005 %backward: 63.94549356674888 [2025-03-25 16:07:55,430] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15798.89 | forward: 145354.20 | backward_microstep: 390131.13 | backward: 390122.52 | backward_inner_microstep: 390104.42 | backward_inner: 390098.16 | backward_allreduce_microstep: 7.97 | backward_allreduce: 2.86 | reduce_tied_grads: 0.32 | comms: 17.99 | reduce_grads: 0.20 | step: 349.30 | _step_clipping: 0.11 | _step_step: 347.67 | _step_zero_grad: 0.46 | _step_check_overflow: 0.49 samples/sec: 16.784 | iteration 960/ 143000 | elapsed time per iteration (ms): 61009.1 | learning rate: 4.028E-04 | approx flops per GPU: 72.4TFLOPS | lm_loss: 4.331599E+00 | loss scale: 4096.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 16:18:11,524] [INFO] [logging.py:60:log_dist] [Rank 0] step=970, skipped=0, lr=[0.00040699300699300695, 0.00040699300699300695], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 970 loss: 4.3143 iter time (s): 61.609 samples/sec: 16.621 %comms: 0.0029292191941959673 %optimizer_step 0.059470003178427086 %forward: 23.603367275714763 %backward: 63.319162984746214 [2025-03-25 16:18:11,525] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21681.80 | forward: 145417.91 | backward_microstep: 390110.91 | backward: 390102.84 | backward_inner_microstep: 390086.57 | backward_inner: 390080.33 | backward_allreduce_microstep: 7.80 | backward_allreduce: 2.69 | reduce_tied_grads: 0.36 | comms: 18.05 | reduce_grads: 0.23 | step: 366.39 | _step_clipping: 0.15 | _step_step: 364.64 | _step_zero_grad: 0.50 | _step_check_overflow: 0.50 samples/sec: 16.621 | iteration 970/ 143000 | elapsed time per iteration (ms): 61609.5 | learning rate: 4.070E-04 | approx flops per GPU: 71.7TFLOPS | lm_loss: 4.325607E+00 | loss scale: 4096.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 16:28:23,337] [INFO] [logging.py:60:log_dist] [Rank 0] step=980, skipped=0, lr=[0.00041118881118881116, 0.00041118881118881116], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 980 loss: 4.3127 iter time (s): 61.181 samples/sec: 16.737 %comms: 0.00295062127962269 %optimizer_step 0.05775561742919754 %forward: 23.73433278301599 %backward: 63.757711721852125 [2025-03-25 16:28:23,337] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17323.81 | forward: 145208.26 | backward_microstep: 390083.89 | backward: 390074.01 | backward_inner_microstep: 390058.36 | backward_inner: 390052.31 | backward_allreduce_microstep: 7.51 | backward_allreduce: 2.61 | reduce_tied_grads: 0.34 | comms: 18.05 | reduce_grads: 0.19 | step: 353.35 | _step_clipping: 0.15 | _step_step: 351.65 | _step_zero_grad: 0.48 | _step_check_overflow: 0.49 samples/sec: 16.737 | iteration 980/ 143000 | elapsed time per iteration (ms): 61181.3 | learning rate: 4.112E-04 | approx flops per GPU: 72.2TFLOPS | lm_loss: 4.299503E+00 | loss scale: 4096.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 16:38:39,665] [INFO] [logging.py:60:log_dist] [Rank 0] step=990, skipped=0, lr=[0.00041538461538461537, 0.00041538461538461537], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 990 loss: 4.2816 iter time (s): 61.632 samples/sec: 16.615 %comms: 0.0029374761908386605 %optimizer_step 0.058790645001877063 %forward: 23.5716938069409 %backward: 63.25960054558165 [2025-03-25 16:38:39,665] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20916.79 | forward: 145277.56 | backward_microstep: 389892.09 | backward: 389882.90 | backward_inner_microstep: 389865.35 | backward_inner: 389859.34 | backward_allreduce_microstep: 9.33 | backward_allreduce: 4.30 | reduce_tied_grads: 0.45 | comms: 18.10 | reduce_grads: 0.22 | step: 362.34 | _step_clipping: 0.14 | _step_step: 360.55 | _step_zero_grad: 0.51 | _step_check_overflow: 0.56 samples/sec: 16.615 | iteration 990/ 143000 | elapsed time per iteration (ms): 61632.8 | learning rate: 4.154E-04 | approx flops per GPU: 71.7TFLOPS | lm_loss: 4.291681E+00 | loss scale: 4096.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 16:49:01,819] [INFO] [logging.py:60:log_dist] [Rank 0] step=1000, skipped=0, lr=[0.0004195804195804196, 0.0004195804195804196], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1000 loss: 4.3261 iter time (s): 62.215 samples/sec: 16.459 %comms: 0.0029437657235926867 %optimizer_step 0.057135543065975035 %forward: 23.357415649289745 %backward: 62.685514215522744 [2025-03-25 16:49:01,820] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24952.74 | forward: 145317.86 | backward_microstep: 390006.28 | backward: 389997.11 | backward_inner_microstep: 389980.95 | backward_inner: 389974.66 | backward_allreduce_microstep: 7.71 | backward_allreduce: 2.65 | reduce_tied_grads: 0.43 | comms: 18.31 | reduce_grads: 0.19 | step: 355.47 | _step_clipping: 0.15 | _step_step: 353.59 | _step_zero_grad: 0.53 | _step_check_overflow: 0.60 samples/sec: 16.459 | iteration 1000/ 143000 | elapsed time per iteration (ms): 62215.5 | learning rate: 4.196E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 4.298421E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 16:49:04,692] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: ../checkpoints/mydata_left-pythia160m/global_step1000/mp_rank_00_model_states.pt [2025-03-25 16:49:18,626] [INFO] [engine.py:1805:_copy_recovery_script] creating recovery script ../checkpoints/mydata_left-pythia160m/zero_to_fp32.py [2025-03-25 16:49:18,633] [INFO] [engine.py:1818:_save_zero_checkpoint] zero checkpoint saved ../checkpoints/mydata_left-pythia160m/global_step1000/zero_pp_rank_0_mp_rank_00_optim_states.pt [2025-03-25 16:59:42,919] [INFO] [logging.py:60:log_dist] [Rank 0] step=1010, skipped=0, lr=[0.0004237762237762238, 0.0004237762237762238], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1010 loss: 4.2700 iter time (s): 62.427 samples/sec: 16.403 %comms: 0.0029491443296042344 %optimizer_step 0.057707559259807126 %forward: 23.30769663783973 %backward: 62.49186055069801 [2025-03-25 16:59:42,919] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25846.46 | forward: 145503.43 | backward_microstep: 390129.50 | backward: 390119.20 | backward_inner_microstep: 390102.70 | backward_inner: 390096.26 | backward_allreduce_microstep: 7.81 | backward_allreduce: 2.70 | reduce_tied_grads: 0.39 | comms: 18.41 | reduce_grads: 0.20 | step: 360.25 | _step_clipping: 0.16 | _step_step: 358.32 | _step_zero_grad: 0.54 | _step_check_overflow: 0.61 samples/sec: 15.973 | iteration 1010/ 143000 | elapsed time per iteration (ms): 64109.9 | learning rate: 4.238E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 4.274308E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 17:09:55,832] [INFO] [logging.py:60:log_dist] [Rank 0] step=1020, skipped=0, lr=[0.00042797202797202795, 0.00042797202797202795], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1020 loss: 4.1982 iter time (s): 61.291 samples/sec: 16.707 %comms: 0.0029845714820768226 %optimizer_step 0.05759524712664985 %forward: 23.69198292760228 %backward: 63.63043892027412 [2025-03-25 17:09:55,833] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14283.72 | forward: 145209.96 | backward_microstep: 390004.80 | backward: 389995.80 | backward_inner_microstep: 389979.92 | backward_inner: 389973.50 | backward_allreduce_microstep: 7.52 | backward_allreduce: 2.59 | reduce_tied_grads: 0.37 | comms: 18.29 | reduce_grads: 0.19 | step: 353.01 | _step_clipping: 0.14 | _step_step: 351.16 | _step_zero_grad: 0.50 | _step_check_overflow: 0.63 samples/sec: 16.707 | iteration 1020/ 143000 | elapsed time per iteration (ms): 61291.3 | learning rate: 4.280E-04 | approx flops per GPU: 72.1TFLOPS | lm_loss: 4.229005E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 17:20:09,914] [INFO] [logging.py:60:log_dist] [Rank 0] step=1030, skipped=0, lr=[0.00043216783216783216, 0.00043216783216783216], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1030 loss: 4.2043 iter time (s): 61.408 samples/sec: 16.675 %comms: 0.002959166950393757 %optimizer_step 0.06030264079111118 %forward: 23.653212941752756 %backward: 63.51013957058827 [2025-03-25 17:20:09,915] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14818.65 | forward: 145248.82 | backward_microstep: 390010.22 | backward: 390000.84 | backward_inner_microstep: 389984.64 | backward_inner: 389977.95 | backward_allreduce_microstep: 7.68 | backward_allreduce: 2.64 | reduce_tied_grads: 0.31 | comms: 18.17 | reduce_grads: 0.19 | step: 370.30 | _step_clipping: 0.15 | _step_step: 368.57 | _step_zero_grad: 0.48 | _step_check_overflow: 0.53 samples/sec: 16.675 | iteration 1030/ 143000 | elapsed time per iteration (ms): 61408.2 | learning rate: 4.322E-04 | approx flops per GPU: 71.9TFLOPS | lm_loss: 4.195944E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 17:30:25,629] [INFO] [logging.py:60:log_dist] [Rank 0] step=1040, skipped=0, lr=[0.00043636363636363637, 0.00043636363636363637], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1040 loss: 4.1809 iter time (s): 61.571 samples/sec: 16.631 %comms: 0.0029433056253551798 %optimizer_step 0.05803473805931547 %forward: 23.607364768997282 %backward: 63.33672656600555 [2025-03-25 17:30:25,630] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15992.93 | forward: 145352.66 | backward_microstep: 389978.24 | backward: 389969.89 | backward_inner_microstep: 389953.45 | backward_inner: 389947.16 | backward_allreduce_microstep: 7.99 | backward_allreduce: 2.91 | reduce_tied_grads: 0.28 | comms: 18.12 | reduce_grads: 0.18 | step: 357.33 | _step_clipping: 0.13 | _step_step: 355.57 | _step_zero_grad: 0.50 | _step_check_overflow: 0.53 samples/sec: 16.631 | iteration 1040/ 143000 | elapsed time per iteration (ms): 61571.5 | learning rate: 4.364E-04 | approx flops per GPU: 71.7TFLOPS | lm_loss: 4.182439E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 17:40:39,548] [INFO] [logging.py:60:log_dist] [Rank 0] step=1050, skipped=0, lr=[0.00044055944055944047, 0.00044055944055944047], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1050 loss: 4.1838 iter time (s): 61.391 samples/sec: 16.680 %comms: 0.0029410003282915064 %optimizer_step 0.05736152609825512 %forward: 23.659619585364904 %backward: 63.509023327383986 [2025-03-25 17:40:39,549] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14124.53 | forward: 145249.62 | backward_microstep: 389898.82 | backward: 389890.52 | backward_inner_microstep: 389874.25 | backward_inner: 389868.11 | backward_allreduce_microstep: 7.60 | backward_allreduce: 2.61 | reduce_tied_grads: 0.31 | comms: 18.06 | reduce_grads: 0.19 | step: 352.15 | _step_clipping: 0.12 | _step_step: 350.47 | _step_zero_grad: 0.48 | _step_check_overflow: 0.51 samples/sec: 16.680 | iteration 1050/ 143000 | elapsed time per iteration (ms): 61391.9 | learning rate: 4.406E-04 | approx flops per GPU: 72.0TFLOPS | lm_loss: 4.186125E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 17:50:59,615] [INFO] [logging.py:60:log_dist] [Rank 0] step=1060, skipped=0, lr=[0.0004447552447552447, 0.0004447552447552447], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1060 loss: 4.1724 iter time (s): 62.006 samples/sec: 16.514 %comms: 0.0029250686483597065 %optimizer_step 0.057878815864337775 %forward: 23.427706226628466 %backward: 62.891527266485944 [2025-03-25 17:50:59,616] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20011.78 | forward: 145266.12 | backward_microstep: 389974.17 | backward: 389965.98 | backward_inner_microstep: 389950.17 | backward_inner: 389944.11 | backward_allreduce_microstep: 7.57 | backward_allreduce: 2.59 | reduce_tied_grads: 0.32 | comms: 18.14 | reduce_grads: 0.19 | step: 358.88 | _step_clipping: 0.13 | _step_step: 357.25 | _step_zero_grad: 0.46 | _step_check_overflow: 0.49 samples/sec: 16.514 | iteration 1060/ 143000 | elapsed time per iteration (ms): 62006.7 | learning rate: 4.448E-04 | approx flops per GPU: 71.2TFLOPS | lm_loss: 4.161823E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 18:01:15,815] [INFO] [logging.py:60:log_dist] [Rank 0] step=1070, skipped=0, lr=[0.0004489510489510489, 0.0004489510489510489], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1070 loss: 4.1280 iter time (s): 61.619 samples/sec: 16.618 %comms: 0.002923965412354712 %optimizer_step 0.05637468770138004 %forward: 23.580982560840372 %backward: 63.27316180605982 [2025-03-25 18:01:15,815] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16104.48 | forward: 145304.54 | backward_microstep: 389893.12 | backward: 389885.27 | backward_inner_microstep: 389869.37 | backward_inner: 389863.36 | backward_allreduce_microstep: 7.64 | backward_allreduce: 2.64 | reduce_tied_grads: 0.31 | comms: 18.02 | reduce_grads: 0.19 | step: 347.38 | _step_clipping: 0.12 | _step_step: 345.71 | _step_zero_grad: 0.48 | _step_check_overflow: 0.49 samples/sec: 16.618 | iteration 1070/ 143000 | elapsed time per iteration (ms): 61620.0 | learning rate: 4.490E-04 | approx flops per GPU: 71.7TFLOPS | lm_loss: 4.147712E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 18:11:37,539] [INFO] [logging.py:60:log_dist] [Rank 0] step=1080, skipped=0, lr=[0.0004531468531468531, 0.0004531468531468531], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1080 loss: 4.1143 iter time (s): 62.172 samples/sec: 16.470 %comms: 0.0029044265049474054 %optimizer_step 0.05631549464176356 %forward: 23.415358667871843 %backward: 62.750441311056285 [2025-03-25 18:11:37,539] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20967.70 | forward: 145577.53 | backward_microstep: 390139.31 | backward: 390130.87 | backward_inner_microstep: 390114.66 | backward_inner: 390108.49 | backward_allreduce_microstep: 7.71 | backward_allreduce: 2.66 | reduce_tied_grads: 0.34 | comms: 18.06 | reduce_grads: 0.19 | step: 350.12 | _step_clipping: 0.15 | _step_step: 348.38 | _step_zero_grad: 0.50 | _step_check_overflow: 0.50 samples/sec: 16.470 | iteration 1080/ 143000 | elapsed time per iteration (ms): 62172.4 | learning rate: 4.531E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 4.122533E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 18:21:56,726] [INFO] [logging.py:60:log_dist] [Rank 0] step=1090, skipped=0, lr=[0.00045734265734265726, 0.00045734265734265726], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1090 loss: 4.1379 iter time (s): 61.918 samples/sec: 16.538 %comms: 0.003005423219816856 %optimizer_step 0.05700646964170628 %forward: 23.49721989861727 %backward: 62.9904738153093 [2025-03-25 18:21:56,727] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18561.87 | forward: 145490.61 | backward_microstep: 390034.12 | backward: 390025.83 | backward_inner_microstep: 390009.70 | backward_inner: 390003.50 | backward_allreduce_microstep: 7.75 | backward_allreduce: 2.66 | reduce_tied_grads: 0.31 | comms: 18.61 | reduce_grads: 0.20 | step: 352.97 | _step_clipping: 0.13 | _step_step: 351.28 | _step_zero_grad: 0.48 | _step_check_overflow: 0.50 samples/sec: 16.538 | iteration 1090/ 143000 | elapsed time per iteration (ms): 61918.8 | learning rate: 4.573E-04 | approx flops per GPU: 71.3TFLOPS | lm_loss: 4.117097E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 18:32:11,533] [INFO] [logging.py:60:log_dist] [Rank 0] step=1100, skipped=0, lr=[0.0004615384615384615, 0.0004615384615384615], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1100 loss: 4.1463 iter time (s): 61.480 samples/sec: 16.656 %comms: 0.002920390228679647 %optimizer_step 0.05624303870394795 %forward: 23.61407788359372 %backward: 63.41309251507064 [2025-03-25 18:32:11,533] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14663.59 | forward: 145179.58 | backward_microstep: 389873.41 | backward: 389864.32 | backward_inner_microstep: 389848.81 | backward_inner: 389842.93 | backward_allreduce_microstep: 7.47 | backward_allreduce: 2.55 | reduce_tied_grads: 0.29 | comms: 17.95 | reduce_grads: 0.18 | step: 345.78 | _step_clipping: 0.12 | _step_step: 344.15 | _step_zero_grad: 0.46 | _step_check_overflow: 0.49 samples/sec: 16.656 | iteration 1100/ 143000 | elapsed time per iteration (ms): 61480.6 | learning rate: 4.615E-04 | approx flops per GPU: 71.9TFLOPS | lm_loss: 4.116048E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 18:42:31,682] [INFO] [logging.py:60:log_dist] [Rank 0] step=1110, skipped=0, lr=[0.0004657342657342657, 0.0004657342657342657], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1110 loss: 4.1380 iter time (s): 62.014 samples/sec: 16.512 %comms: 0.0029729300292164074 %optimizer_step 0.05672045851921821 %forward: 23.437449315936384 %backward: 62.891211192135486 [2025-03-25 18:42:31,683] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19630.91 | forward: 145345.81 | backward_microstep: 390024.72 | backward: 390015.75 | backward_inner_microstep: 389999.55 | backward_inner: 389993.23 | backward_allreduce_microstep: 7.81 | backward_allreduce: 2.72 | reduce_tied_grads: 0.36 | comms: 18.44 | reduce_grads: 0.21 | step: 351.75 | _step_clipping: 0.15 | _step_step: 349.82 | _step_zero_grad: 0.50 | _step_check_overflow: 0.67 samples/sec: 16.512 | iteration 1110/ 143000 | elapsed time per iteration (ms): 62014.9 | learning rate: 4.657E-04 | approx flops per GPU: 71.2TFLOPS | lm_loss: 4.131181E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 18:52:56,010] [INFO] [logging.py:60:log_dist] [Rank 0] step=1120, skipped=0, lr=[0.0004699300699300699, 0.0004699300699300699], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1120 loss: 4.0971 iter time (s): 62.432 samples/sec: 16.402 %comms: 0.0029280173459241042 %optimizer_step 0.05641175432482164 %forward: 23.276528894303787 %backward: 62.49786947528169 [2025-03-25 18:52:56,011] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23571.85 | forward: 145320.58 | backward_microstep: 390198.79 | backward: 390188.19 | backward_inner_microstep: 390171.66 | backward_inner: 390165.25 | backward_allreduce_microstep: 7.87 | backward_allreduce: 2.72 | reduce_tied_grads: 0.35 | comms: 18.28 | reduce_grads: 0.18 | step: 352.19 | _step_clipping: 0.14 | _step_step: 350.41 | _step_zero_grad: 0.49 | _step_check_overflow: 0.55 samples/sec: 16.402 | iteration 1120/ 143000 | elapsed time per iteration (ms): 62432.8 | learning rate: 4.699E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 4.109563E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 19:03:23,120] [INFO] [logging.py:60:log_dist] [Rank 0] step=1130, skipped=0, lr=[0.0004741258741258741, 0.0004741258741258741], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1130 loss: 4.0614 iter time (s): 62.710 samples/sec: 16.329 %comms: 0.0028730963974186105 %optimizer_step 0.05749154475522134 %forward: 23.174584952273918 %backward: 62.222508464795155 [2025-03-25 19:03:23,120] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26236.90 | forward: 145328.67 | backward_microstep: 390208.01 | backward: 390199.61 | backward_inner_microstep: 390181.75 | backward_inner: 390173.68 | backward_allreduce_microstep: 9.44 | backward_allreduce: 4.41 | reduce_tied_grads: 0.37 | comms: 18.02 | reduce_grads: 0.20 | step: 360.53 | _step_clipping: 0.14 | _step_step: 358.80 | _step_zero_grad: 0.48 | _step_check_overflow: 0.53 samples/sec: 16.329 | iteration 1130/ 143000 | elapsed time per iteration (ms): 62710.9 | learning rate: 4.741E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 4.069877E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 19:13:39,133] [INFO] [logging.py:60:log_dist] [Rank 0] step=1140, skipped=0, lr=[0.00047832167832167826, 0.00047832167832167826], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1140 loss: 4.0356 iter time (s): 61.601 samples/sec: 16.623 %comms: 0.0029392455847623285 %optimizer_step 0.05812586302893105 %forward: 23.605586685722663 %backward: 63.34364807970384 [2025-03-25 19:13:39,134] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14534.02 | forward: 145412.27 | backward_microstep: 390210.73 | backward: 390201.85 | backward_inner_microstep: 390186.13 | backward_inner: 390179.95 | backward_allreduce_microstep: 7.48 | backward_allreduce: 2.57 | reduce_tied_grads: 0.35 | comms: 18.11 | reduce_grads: 0.20 | step: 358.06 | _step_clipping: 0.14 | _step_step: 356.35 | _step_zero_grad: 0.50 | _step_check_overflow: 0.49 samples/sec: 16.623 | iteration 1140/ 143000 | elapsed time per iteration (ms): 61601.4 | learning rate: 4.783E-04 | approx flops per GPU: 71.7TFLOPS | lm_loss: 4.055544E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 19:24:01,723] [INFO] [logging.py:60:log_dist] [Rank 0] step=1150, skipped=0, lr=[0.0004825174825174825, 0.0004825174825174825], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1150 loss: 4.0426 iter time (s): 62.258 samples/sec: 16.448 %comms: 0.0029703538557331057 %optimizer_step 0.057559303713089524 %forward: 23.347058240773695 %backward: 62.672114543406906 [2025-03-25 19:24:01,723] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20160.06 | forward: 145354.96 | backward_microstep: 390195.93 | backward: 390186.33 | backward_inner_microstep: 390170.09 | backward_inner: 390163.62 | backward_allreduce_microstep: 7.69 | backward_allreduce: 2.62 | reduce_tied_grads: 0.53 | comms: 18.49 | reduce_grads: 0.19 | step: 358.35 | _step_clipping: 0.13 | _step_step: 356.52 | _step_zero_grad: 0.50 | _step_check_overflow: 0.62 samples/sec: 16.447 | iteration 1150/ 143000 | elapsed time per iteration (ms): 62259.0 | learning rate: 4.825E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 4.053972E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 19:34:27,593] [INFO] [logging.py:60:log_dist] [Rank 0] step=1160, skipped=0, lr=[0.0004867132867132867, 0.0004867132867132867], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1160 loss: 4.0357 iter time (s): 62.586 samples/sec: 16.361 %comms: 0.0028950125371916694 %optimizer_step 0.057810957762792575 %forward: 23.21456520654073 %backward: 62.31775169239609 [2025-03-25 19:34:27,594] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22155.37 | forward: 145291.74 | backward_microstep: 390032.45 | backward: 390024.74 | backward_inner_microstep: 390008.66 | backward_inner: 390002.66 | backward_allreduce_microstep: 7.69 | backward_allreduce: 2.66 | reduce_tied_grads: 0.39 | comms: 18.12 | reduce_grads: 0.21 | step: 361.82 | _step_clipping: 0.19 | _step_step: 359.87 | _step_zero_grad: 0.49 | _step_check_overflow: 0.68 samples/sec: 16.361 | iteration 1160/ 143000 | elapsed time per iteration (ms): 62587.1 | learning rate: 4.867E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 4.061697E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 19:44:51,740] [INFO] [logging.py:60:log_dist] [Rank 0] step=1170, skipped=0, lr=[0.0004909090909090909, 0.0004909090909090909], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1170 loss: 4.0121 iter time (s): 62.414 samples/sec: 16.407 %comms: 0.002890630817123781 %optimizer_step 0.056325052312445556 %forward: 23.270120968812353 %backward: 62.49046687169197 [2025-03-25 19:44:51,741] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18831.95 | forward: 145238.35 | backward_microstep: 390036.02 | backward: 390028.60 | backward_inner_microstep: 390012.60 | backward_inner: 390006.71 | backward_allreduce_microstep: 7.75 | backward_allreduce: 2.66 | reduce_tied_grads: 0.30 | comms: 18.04 | reduce_grads: 0.20 | step: 351.55 | _step_clipping: 0.14 | _step_step: 349.83 | _step_zero_grad: 0.50 | _step_check_overflow: 0.51 samples/sec: 16.406 | iteration 1170/ 143000 | elapsed time per iteration (ms): 62414.7 | learning rate: 4.909E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 4.026213E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 19:55:18,735] [INFO] [logging.py:60:log_dist] [Rank 0] step=1180, skipped=0, lr=[0.000495104895104895, 0.000495104895104895], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1180 loss: 4.0162 iter time (s): 62.699 samples/sec: 16.332 %comms: 0.002848979176503493 %optimizer_step 0.055932957667785325 %forward: 23.186284583638614 %backward: 62.18827600608945 [2025-03-25 19:55:18,736] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20131.84 | forward: 145375.54 | backward_microstep: 389922.27 | backward: 389913.88 | backward_inner_microstep: 389897.56 | backward_inner: 389891.47 | backward_allreduce_microstep: 7.78 | backward_allreduce: 2.66 | reduce_tied_grads: 0.28 | comms: 17.86 | reduce_grads: 0.19 | step: 350.69 | _step_clipping: 0.13 | _step_step: 349.07 | _step_zero_grad: 0.46 | _step_check_overflow: 0.49 samples/sec: 16.332 | iteration 1180/ 143000 | elapsed time per iteration (ms): 62699.5 | learning rate: 4.951E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 4.014919E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 20:05:45,309] [INFO] [logging.py:60:log_dist] [Rank 0] step=1190, skipped=0, lr=[0.0004993006993006993, 0.0004993006993006993], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1190 loss: 4.0282 iter time (s): 62.657 samples/sec: 16.343 %comms: 0.0028900489501294166 %optimizer_step 0.05696647170807887 %forward: 23.18390422330215 %backward: 62.23666799549964 [2025-03-25 20:05:45,309] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18672.40 | forward: 145262.99 | backward_microstep: 389963.76 | backward: 389955.21 | backward_inner_microstep: 389939.02 | backward_inner: 389933.03 | backward_allreduce_microstep: 7.85 | backward_allreduce: 2.70 | reduce_tied_grads: 0.29 | comms: 18.11 | reduce_grads: 0.21 | step: 356.93 | _step_clipping: 0.11 | _step_step: 355.24 | _step_zero_grad: 0.50 | _step_check_overflow: 0.51 samples/sec: 16.343 | iteration 1190/ 143000 | elapsed time per iteration (ms): 62657.4 | learning rate: 4.993E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 4.023269E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 20:16:12,555] [INFO] [logging.py:60:log_dist] [Rank 0] step=1200, skipped=0, lr=[0.0005034965034965035, 0.0005034965034965035], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1200 loss: 3.9905 iter time (s): 62.724 samples/sec: 16.325 %comms: 0.002876800633158261 %optimizer_step 0.05770268951397119 %forward: 23.1813518679522 %backward: 62.18356128573227 [2025-03-25 20:16:12,556] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18236.13 | forward: 145402.93 | backward_microstep: 390050.39 | backward: 390040.76 | backward_inner_microstep: 390024.24 | backward_inner: 390017.87 | backward_allreduce_microstep: 7.93 | backward_allreduce: 2.73 | reduce_tied_grads: 0.34 | comms: 18.04 | reduce_grads: 0.20 | step: 361.93 | _step_clipping: 0.15 | _step_step: 360.17 | _step_zero_grad: 0.48 | _step_check_overflow: 0.54 samples/sec: 16.325 | iteration 1200/ 143000 | elapsed time per iteration (ms): 62724.6 | learning rate: 5.035E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 4.012409E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 20:26:42,311] [INFO] [logging.py:60:log_dist] [Rank 0] step=1210, skipped=0, lr=[0.0005076923076923076, 0.0005076923076923076], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1210 loss: 3.9939 iter time (s): 62.975 samples/sec: 16.260 %comms: 0.0028789324385005213 %optimizer_step 0.05552629776477506 %forward: 23.056755065412304 %backward: 61.943138551933075 [2025-03-25 20:26:42,311] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20325.30 | forward: 145199.81 | backward_microstep: 390096.11 | backward: 390086.63 | backward_inner_microstep: 390070.62 | backward_inner: 390064.30 | backward_allreduce_microstep: 7.63 | backward_allreduce: 2.63 | reduce_tied_grads: 0.33 | comms: 18.13 | reduce_grads: 0.19 | step: 349.68 | _step_clipping: 0.14 | _step_step: 347.98 | _step_zero_grad: 0.47 | _step_check_overflow: 0.51 samples/sec: 16.260 | iteration 1210/ 143000 | elapsed time per iteration (ms): 62975.5 | learning rate: 5.077E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 3.975056E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 20:37:14,266] [INFO] [logging.py:60:log_dist] [Rank 0] step=1220, skipped=0, lr=[0.0005118881118881119, 0.0005118881118881119], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1220 loss: 3.9745 iter time (s): 63.195 samples/sec: 16.204 %comms: 0.0028630617183312006 %optimizer_step 0.056597268743852146 %forward: 23.011102464999407 %backward: 61.719440199178436 [2025-03-25 20:37:14,267] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21958.90 | forward: 145418.59 | backward_microstep: 390045.67 | backward: 390035.80 | backward_inner_microstep: 390019.45 | backward_inner: 390013.15 | backward_allreduce_microstep: 7.86 | backward_allreduce: 2.78 | reduce_tied_grads: 0.36 | comms: 18.09 | reduce_grads: 0.20 | step: 357.67 | _step_clipping: 0.15 | _step_step: 355.87 | _step_zero_grad: 0.52 | _step_check_overflow: 0.53 samples/sec: 16.204 | iteration 1220/ 143000 | elapsed time per iteration (ms): 63195.6 | learning rate: 5.119E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 3.972663E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 20:47:39,906] [INFO] [logging.py:60:log_dist] [Rank 0] step=1230, skipped=0, lr=[0.000516083916083916, 0.000516083916083916], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1230 loss: 3.9647 iter time (s): 62.563 samples/sec: 16.367 %comms: 0.0028763771206606593 %optimizer_step 0.056231707444401396 %forward: 23.219265159732036 %backward: 62.35784042741408 [2025-03-25 20:47:39,907] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15441.05 | forward: 145267.64 | backward_microstep: 390141.76 | backward: 390131.91 | backward_inner_microstep: 390115.99 | backward_inner: 390109.85 | backward_allreduce_microstep: 7.50 | backward_allreduce: 2.57 | reduce_tied_grads: 0.36 | comms: 18.00 | reduce_grads: 0.20 | step: 351.80 | _step_clipping: 0.15 | _step_step: 350.05 | _step_zero_grad: 0.50 | _step_check_overflow: 0.52 samples/sec: 16.367 | iteration 1230/ 143000 | elapsed time per iteration (ms): 62564.0 | learning rate: 5.161E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 3.984523E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 20:58:14,633] [INFO] [logging.py:60:log_dist] [Rank 0] step=1240, skipped=0, lr=[0.0005202797202797202, 0.0005202797202797202], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1240 loss: 3.9621 iter time (s): 63.472 samples/sec: 16.133 %comms: 0.0028667149955193707 %optimizer_step 0.05577623403734112 %forward: 22.92196120393664 %backward: 61.46788341713564 [2025-03-25 20:58:14,634] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24142.43 | forward: 145490.40 | backward_microstep: 390159.58 | backward: 390149.29 | backward_inner_microstep: 390133.02 | backward_inner: 390126.72 | backward_allreduce_microstep: 7.65 | backward_allreduce: 2.64 | reduce_tied_grads: 0.35 | comms: 18.20 | reduce_grads: 0.20 | step: 354.02 | _step_clipping: 0.14 | _step_step: 352.23 | _step_zero_grad: 0.48 | _step_check_overflow: 0.59 samples/sec: 16.133 | iteration 1240/ 143000 | elapsed time per iteration (ms): 63472.6 | learning rate: 5.203E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 3.968869E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 21:08:47,755] [INFO] [logging.py:60:log_dist] [Rank 0] step=1250, skipped=0, lr=[0.0005244755244755244, 0.0005244755244755244], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1250 loss: 3.9341 iter time (s): 63.312 samples/sec: 16.174 %comms: 0.0028474302381756977 %optimizer_step 0.05536874097313155 %forward: 22.976764531671158 %backward: 61.634085997544794 [2025-03-25 21:08:47,756] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22352.46 | forward: 145469.63 | backward_microstep: 390225.98 | backward: 390215.42 | backward_inner_microstep: 390199.34 | backward_inner: 390193.18 | backward_allreduce_microstep: 7.63 | backward_allreduce: 2.62 | reduce_tied_grads: 0.33 | comms: 18.03 | reduce_grads: 0.19 | step: 350.55 | _step_clipping: 0.33 | _step_step: 348.63 | _step_zero_grad: 0.48 | _step_check_overflow: 0.51 samples/sec: 16.174 | iteration 1250/ 143000 | elapsed time per iteration (ms): 63312.2 | learning rate: 5.245E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 3.955084E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 21:19:13,613] [INFO] [logging.py:60:log_dist] [Rank 0] step=1260, skipped=0, lr=[0.0005286713286713286, 0.0005286713286713286], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1260 loss: 3.9369 iter time (s): 62.585 samples/sec: 16.362 %comms: 0.002902689102864615 %optimizer_step 0.05574465165713824 %forward: 23.187894223325678 %backward: 62.306781894728225 [2025-03-25 21:19:13,613] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15653.53 | forward: 145121.93 | backward_microstep: 389957.93 | backward: 389948.33 | backward_inner_microstep: 389932.76 | backward_inner: 389926.78 | backward_allreduce_microstep: 7.48 | backward_allreduce: 2.57 | reduce_tied_grads: 0.32 | comms: 18.17 | reduce_grads: 0.21 | step: 348.88 | _step_clipping: 0.14 | _step_step: 346.90 | _step_zero_grad: 0.62 | _step_check_overflow: 0.66 samples/sec: 16.362 | iteration 1260/ 143000 | elapsed time per iteration (ms): 62585.8 | learning rate: 5.287E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 3.947623E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 21:29:40,074] [INFO] [logging.py:60:log_dist] [Rank 0] step=1270, skipped=0, lr=[0.0005328671328671328, 0.0005328671328671328], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1270 loss: 3.9066 iter time (s): 62.646 samples/sec: 16.346 %comms: 0.002870474301675491 %optimizer_step 0.05528213891073885 %forward: 23.184068254810285 %backward: 62.272755045456094 [2025-03-25 21:29:40,074] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15893.70 | forward: 145237.87 | backward_microstep: 390119.57 | backward: 390111.11 | backward_inner_microstep: 390095.31 | backward_inner: 390089.33 | backward_allreduce_microstep: 7.69 | backward_allreduce: 2.60 | reduce_tied_grads: 0.34 | comms: 17.98 | reduce_grads: 0.19 | step: 346.32 | _step_clipping: 0.13 | _step_step: 344.68 | _step_zero_grad: 0.47 | _step_check_overflow: 0.46 samples/sec: 16.346 | iteration 1270/ 143000 | elapsed time per iteration (ms): 62646.1 | learning rate: 5.329E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 3.920485E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 21:40:10,202] [INFO] [logging.py:60:log_dist] [Rank 0] step=1280, skipped=0, lr=[0.0005370629370629369, 0.0005370629370629369], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1280 loss: 3.9373 iter time (s): 63.012 samples/sec: 16.251 %comms: 0.0029090128412884208 %optimizer_step 0.055671709521883536 %forward: 23.0685444403898 %backward: 61.929014494381704 [2025-03-25 21:40:10,202] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19222.41 | forward: 145360.02 | backward_microstep: 390237.50 | backward: 390228.47 | backward_inner_microstep: 390212.50 | backward_inner: 390206.36 | backward_allreduce_microstep: 7.55 | backward_allreduce: 2.61 | reduce_tied_grads: 0.34 | comms: 18.33 | reduce_grads: 0.18 | step: 350.80 | _step_clipping: 0.14 | _step_step: 349.02 | _step_zero_grad: 0.50 | _step_check_overflow: 0.56 samples/sec: 16.251 | iteration 1280/ 143000 | elapsed time per iteration (ms): 63012.8 | learning rate: 5.371E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 3.925053E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 21:50:43,097] [INFO] [logging.py:60:log_dist] [Rank 0] step=1290, skipped=0, lr=[0.0005412587412587412, 0.0005412587412587412], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1290 loss: 3.9198 iter time (s): 63.289 samples/sec: 16.180 %comms: 0.0028549318758856823 %optimizer_step 0.056389585228105936 %forward: 22.9851742184082 %backward: 61.63721435664454 [2025-03-25 21:50:43,097] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21986.49 | forward: 145470.66 | backward_microstep: 390104.05 | backward: 390095.20 | backward_inner_microstep: 390078.84 | backward_inner: 390072.58 | backward_allreduce_microstep: 7.89 | backward_allreduce: 2.66 | reduce_tied_grads: 0.40 | comms: 18.07 | reduce_grads: 0.21 | step: 356.88 | _step_clipping: 0.15 | _step_step: 354.99 | _step_zero_grad: 0.53 | _step_check_overflow: 0.62 samples/sec: 16.180 | iteration 1290/ 143000 | elapsed time per iteration (ms): 63289.5 | learning rate: 5.413E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 3.926586E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 22:01:16,004] [INFO] [logging.py:60:log_dist] [Rank 0] step=1300, skipped=0, lr=[0.0005454545454545454, 0.0005454545454545454], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1300 loss: 3.8798 iter time (s): 63.290 samples/sec: 16.179 %comms: 0.002856308272733188 %optimizer_step 0.057433144288454335 %forward: 22.982108681005997 %backward: 61.62165405714409 [2025-03-25 22:01:16,005] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22087.43 | forward: 145454.06 | backward_microstep: 390012.53 | backward: 390004.24 | backward_inner_microstep: 389988.15 | backward_inner: 389982.08 | backward_allreduce_microstep: 7.80 | backward_allreduce: 2.63 | reduce_tied_grads: 0.36 | comms: 18.08 | reduce_grads: 0.22 | step: 363.50 | _step_clipping: 0.15 | _step_step: 361.72 | _step_zero_grad: 0.51 | _step_check_overflow: 0.51 samples/sec: 16.179 | iteration 1300/ 143000 | elapsed time per iteration (ms): 63290.7 | learning rate: 5.455E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 3.891996E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 22:11:47,859] [INFO] [logging.py:60:log_dist] [Rank 0] step=1310, skipped=0, lr=[0.0005496503496503496, 0.0005496503496503496], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1310 loss: 3.8628 iter time (s): 63.185 samples/sec: 16.206 %comms: 0.003150137608477237 %optimizer_step 0.057116299471302465 %forward: 23.057136734764587 %backward: 61.79090309137274 [2025-03-25 22:11:47,860] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20391.48 | forward: 145686.46 | backward_microstep: 390435.42 | backward: 390425.67 | backward_inner_microstep: 390409.08 | backward_inner: 390402.85 | backward_allreduce_microstep: 8.00 | backward_allreduce: 2.84 | reduce_tied_grads: 0.34 | comms: 19.90 | reduce_grads: 1.98 | step: 360.89 | _step_clipping: 0.15 | _step_step: 359.09 | _step_zero_grad: 0.52 | _step_check_overflow: 0.57 samples/sec: 16.206 | iteration 1310/ 143000 | elapsed time per iteration (ms): 63185.6 | learning rate: 5.497E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 3.888984E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 22:22:18,510] [INFO] [logging.py:60:log_dist] [Rank 0] step=1320, skipped=0, lr=[0.0005538461538461538, 0.0005538461538461538], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1320 loss: 3.9084 iter time (s): 63.065 samples/sec: 16.237 %comms: 0.0028499682853744784 %optimizer_step 0.05506852495350759 %forward: 23.071976304607944 %backward: 61.86099373804901 [2025-03-25 22:22:18,511] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19929.77 | forward: 145502.28 | backward_microstep: 390131.67 | backward: 390123.31 | backward_inner_microstep: 390107.32 | backward_inner: 390101.29 | backward_allreduce_microstep: 7.52 | backward_allreduce: 2.59 | reduce_tied_grads: 0.33 | comms: 17.97 | reduce_grads: 0.18 | step: 347.29 | _step_clipping: 0.14 | _step_step: 345.64 | _step_zero_grad: 0.46 | _step_check_overflow: 0.51 samples/sec: 16.237 | iteration 1320/ 143000 | elapsed time per iteration (ms): 63065.1 | learning rate: 5.538E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 3.886086E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 22:32:48,576] [INFO] [logging.py:60:log_dist] [Rank 0] step=1330, skipped=0, lr=[0.000558041958041958, 0.000558041958041958], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1330 loss: 3.8726 iter time (s): 63.006 samples/sec: 16.252 %comms: 0.002891779167802769 %optimizer_step 0.05842377774737083 %forward: 23.154613612086557 %backward: 61.93336911155133 [2025-03-25 22:32:48,577] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19171.51 | forward: 145887.99 | backward_microstep: 390229.29 | backward: 390217.47 | backward_inner_microstep: 390200.83 | backward_inner: 390194.37 | backward_allreduce_microstep: 7.77 | backward_allreduce: 2.67 | reduce_tied_grads: 0.34 | comms: 18.22 | reduce_grads: 0.19 | step: 368.10 | _step_clipping: 0.14 | _step_step: 366.11 | _step_zero_grad: 0.50 | _step_check_overflow: 0.78 samples/sec: 16.252 | iteration 1330/ 143000 | elapsed time per iteration (ms): 63006.6 | learning rate: 5.580E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 3.900123E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 22:43:19,915] [INFO] [logging.py:60:log_dist] [Rank 0] step=1340, skipped=0, lr=[0.0005622377622377622, 0.0005622377622377622], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1340 loss: 3.8753 iter time (s): 63.133 samples/sec: 16.220 %comms: 0.002839950785354549 %optimizer_step 0.055635157787702644 %forward: 23.082904915681095 %backward: 61.78613254123654 [2025-03-25 22:43:19,916] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21062.78 | forward: 145730.06 | backward_microstep: 390085.85 | backward: 390076.41 | backward_inner_microstep: 390060.21 | backward_inner: 390053.95 | backward_allreduce_microstep: 7.71 | backward_allreduce: 2.67 | reduce_tied_grads: 0.32 | comms: 17.93 | reduce_grads: 0.18 | step: 351.24 | _step_clipping: 0.11 | _step_step: 349.58 | _step_zero_grad: 0.51 | _step_check_overflow: 0.51 samples/sec: 16.219 | iteration 1340/ 143000 | elapsed time per iteration (ms): 63133.9 | learning rate: 5.622E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 3.868945E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 22:53:44,735] [INFO] [logging.py:60:log_dist] [Rank 0] step=1350, skipped=0, lr=[0.0005664335664335664, 0.0005664335664335664], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1350 loss: 3.8857 iter time (s): 62.481 samples/sec: 16.389 %comms: 0.002883316722978738 %optimizer_step 0.056986391886763584 %forward: 23.272466253510274 %backward: 62.438326434538325 [2025-03-25 22:53:44,736] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15075.50 | forward: 145409.77 | backward_microstep: 390132.59 | backward: 390123.80 | backward_inner_microstep: 390107.86 | backward_inner: 390101.45 | backward_allreduce_microstep: 7.54 | backward_allreduce: 2.60 | reduce_tied_grads: 0.31 | comms: 18.02 | reduce_grads: 0.19 | step: 356.06 | _step_clipping: 0.13 | _step_step: 354.42 | _step_zero_grad: 0.47 | _step_check_overflow: 0.49 samples/sec: 16.389 | iteration 1350/ 143000 | elapsed time per iteration (ms): 62482.0 | learning rate: 5.664E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 3.864583E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 23:04:09,186] [INFO] [logging.py:60:log_dist] [Rank 0] step=1360, skipped=0, lr=[0.0005706293706293706, 0.0005706293706293706], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1360 loss: 3.8248 iter time (s): 62.444 samples/sec: 16.399 %comms: 0.002890063969199125 %optimizer_step 0.05704522148351376 %forward: 23.265699191707252 %backward: 62.44971868947605 [2025-03-25 23:04:09,186] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15192.58 | forward: 145281.46 | backward_microstep: 389972.53 | backward: 389964.06 | backward_inner_microstep: 389948.35 | backward_inner: 389942.28 | backward_allreduce_microstep: 7.49 | backward_allreduce: 2.56 | reduce_tied_grads: 0.32 | comms: 18.05 | reduce_grads: 0.18 | step: 356.22 | _step_clipping: 0.13 | _step_step: 354.54 | _step_zero_grad: 0.47 | _step_check_overflow: 0.53 samples/sec: 16.398 | iteration 1360/ 143000 | elapsed time per iteration (ms): 62445.0 | learning rate: 5.706E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 3.842892E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 23:14:33,765] [INFO] [logging.py:60:log_dist] [Rank 0] step=1370, skipped=0, lr=[0.0005748251748251748, 0.0005748251748251748], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1370 loss: 3.8350 iter time (s): 62.457 samples/sec: 16.395 %comms: 0.002892328621625453 %optimizer_step 0.05728810171579932 %forward: 23.256222254359802 %backward: 62.45199833488234 [2025-03-25 23:14:33,766] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15384.37 | forward: 145252.36 | backward_microstep: 390067.19 | backward: 390059.05 | backward_inner_microstep: 390043.24 | backward_inner: 390037.15 | backward_allreduce_microstep: 7.55 | backward_allreduce: 2.58 | reduce_tied_grads: 0.34 | comms: 18.06 | reduce_grads: 0.20 | step: 357.81 | _step_clipping: 0.12 | _step_step: 355.99 | _step_zero_grad: 0.53 | _step_check_overflow: 0.56 samples/sec: 16.395 | iteration 1370/ 143000 | elapsed time per iteration (ms): 62458.0 | learning rate: 5.748E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 3.835397E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 23:24:59,799] [INFO] [logging.py:60:log_dist] [Rank 0] step=1380, skipped=0, lr=[0.000579020979020979, 0.000579020979020979], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1380 loss: 3.8230 iter time (s): 62.603 samples/sec: 16.357 %comms: 0.0028561331821129604 %optimizer_step 0.05569983364447616 %forward: 23.2207689017105 %backward: 62.33015181601789 [2025-03-25 23:24:59,800] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16659.18 | forward: 145368.58 | backward_microstep: 390212.51 | backward: 390204.38 | backward_inner_microstep: 390188.33 | backward_inner: 390182.17 | backward_allreduce_microstep: 7.70 | backward_allreduce: 2.64 | reduce_tied_grads: 0.28 | comms: 17.88 | reduce_grads: 0.18 | step: 348.70 | _step_clipping: 0.11 | _step_step: 347.08 | _step_zero_grad: 0.46 | _step_check_overflow: 0.51 samples/sec: 16.357 | iteration 1380/ 143000 | elapsed time per iteration (ms): 62603.4 | learning rate: 5.790E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 3.835835E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 23:35:25,641] [INFO] [logging.py:60:log_dist] [Rank 0] step=1390, skipped=0, lr=[0.0005832167832167832, 0.0005832167832167832], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1390 loss: 3.8230 iter time (s): 62.584 samples/sec: 16.362 %comms: 0.0028737713768775363 %optimizer_step 0.05646877896775405 %forward: 23.221678814220375 %backward: 62.334794094162746 [2025-03-25 23:35:25,642] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16691.24 | forward: 145329.70 | backward_microstep: 390123.71 | backward: 390113.79 | backward_inner_microstep: 390097.51 | backward_inner: 390091.32 | backward_allreduce_microstep: 7.83 | backward_allreduce: 2.68 | reduce_tied_grads: 0.31 | comms: 17.99 | reduce_grads: 0.21 | step: 353.40 | _step_clipping: 0.12 | _step_step: 351.75 | _step_zero_grad: 0.50 | _step_check_overflow: 0.47 samples/sec: 16.362 | iteration 1390/ 143000 | elapsed time per iteration (ms): 62584.2 | learning rate: 5.832E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 3.825817E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 23:45:51,176] [INFO] [logging.py:60:log_dist] [Rank 0] step=1400, skipped=0, lr=[0.0005874125874125874, 0.0005874125874125874], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1400 loss: 3.8127 iter time (s): 62.553 samples/sec: 16.370 %comms: 0.002879716687043144 %optimizer_step 0.05747122331786544 %forward: 23.249120229095585 %backward: 62.356747063340165 [2025-03-25 23:45:51,177] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16379.61 | forward: 145430.10 | backward_microstep: 390067.81 | backward: 390059.85 | backward_inner_microstep: 390043.71 | backward_inner: 390037.48 | backward_allreduce_microstep: 7.74 | backward_allreduce: 2.67 | reduce_tied_grads: 0.33 | comms: 18.01 | reduce_grads: 0.22 | step: 359.50 | _step_clipping: 0.13 | _step_step: 357.78 | _step_zero_grad: 0.49 | _step_check_overflow: 0.50 samples/sec: 16.370 | iteration 1400/ 143000 | elapsed time per iteration (ms): 62553.5 | learning rate: 5.874E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 3.811476E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-25 23:56:16,427] [INFO] [logging.py:60:log_dist] [Rank 0] step=1410, skipped=0, lr=[0.0005916083916083916, 0.0005916083916083916], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1410 loss: 3.8697 iter time (s): 62.525 samples/sec: 16.378 %comms: 0.0028666511076318214 %optimizer_step 0.057314604385433884 %forward: 23.25265717798604 %backward: 62.389760807130344 [2025-03-25 23:56:16,427] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16192.10 | forward: 145386.08 | backward_microstep: 390097.08 | backward: 390088.88 | backward_inner_microstep: 390072.50 | backward_inner: 390066.33 | backward_allreduce_microstep: 7.95 | backward_allreduce: 2.74 | reduce_tied_grads: 0.31 | comms: 17.92 | reduce_grads: 0.19 | step: 358.36 | _step_clipping: 0.11 | _step_step: 356.66 | _step_zero_grad: 0.52 | _step_check_overflow: 0.50 samples/sec: 16.377 | iteration 1410/ 143000 | elapsed time per iteration (ms): 62525.1 | learning rate: 5.916E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 3.849541E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 00:06:43,159] [INFO] [logging.py:60:log_dist] [Rank 0] step=1420, skipped=0, lr=[0.0005958041958041958, 0.0005958041958041958], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1420 loss: 3.8291 iter time (s): 62.673 samples/sec: 16.339 %comms: 0.002851960188502863 %optimizer_step 0.055883948571182326 %forward: 23.267985373991376 %backward: 62.32063338229699 [2025-03-26 00:06:43,159] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16693.68 | forward: 145826.73 | backward_microstep: 390589.06 | backward: 390580.19 | backward_inner_microstep: 390563.81 | backward_inner: 390557.58 | backward_allreduce_microstep: 7.77 | backward_allreduce: 2.69 | reduce_tied_grads: 0.28 | comms: 17.87 | reduce_grads: 0.19 | step: 350.24 | _step_clipping: 0.12 | _step_step: 348.66 | _step_zero_grad: 0.45 | _step_check_overflow: 0.46 samples/sec: 16.339 | iteration 1420/ 143000 | elapsed time per iteration (ms): 62673.2 | learning rate: 5.958E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 3.851433E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 00:17:14,155] [INFO] [logging.py:60:log_dist] [Rank 0] step=1430, skipped=0, lr=[0.0006, 0.0006], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1430 loss: 3.8013 iter time (s): 63.099 samples/sec: 16.228 %comms: 0.002848295824918769 %optimizer_step 0.05527281618770725 %forward: 23.041045174622436 %backward: 61.84498864219463 [2025-03-26 00:17:14,155] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21748.46 | forward: 145386.75 | backward_microstep: 390243.28 | backward: 390235.86 | backward_inner_microstep: 390220.27 | backward_inner: 390214.42 | backward_allreduce_microstep: 7.51 | backward_allreduce: 2.57 | reduce_tied_grads: 0.33 | comms: 17.97 | reduce_grads: 0.19 | step: 348.77 | _step_clipping: 0.14 | _step_step: 347.12 | _step_zero_grad: 0.46 | _step_check_overflow: 0.51 samples/sec: 16.228 | iteration 1430/ 143000 | elapsed time per iteration (ms): 63099.6 | learning rate: 6.000E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 3.810093E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 00:27:41,502] [INFO] [logging.py:60:log_dist] [Rank 0] step=1440, skipped=0, lr=[0.0005999999927603273, 0.0005999999927603273], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1440 loss: 3.7927 iter time (s): 62.734 samples/sec: 16.323 %comms: 0.002851408807351395 %optimizer_step 0.055765397445185064 %forward: 23.19126100526679 %backward: 62.21193261812695 [2025-03-26 00:27:41,502] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17904.92 | forward: 145488.39 | backward_microstep: 390288.00 | backward: 390281.24 | backward_inner_microstep: 390265.44 | backward_inner: 390259.57 | backward_allreduce_microstep: 7.76 | backward_allreduce: 2.62 | reduce_tied_grads: 0.27 | comms: 17.89 | reduce_grads: 0.18 | step: 349.84 | _step_clipping: 0.13 | _step_step: 347.82 | _step_zero_grad: 0.45 | _step_check_overflow: 0.89 samples/sec: 16.323 | iteration 1440/ 143000 | elapsed time per iteration (ms): 62734.7 | learning rate: 6.000E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 3.778848E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 00:38:05,896] [INFO] [logging.py:60:log_dist] [Rank 0] step=1450, skipped=0, lr=[0.0005999999710413098, 0.0005999999710413098], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1450 loss: 3.7928 iter time (s): 62.439 samples/sec: 16.400 %comms: 0.0028617593654710377 %optimizer_step 0.05571950033142317 %forward: 23.259104729271222 %backward: 62.4597565805186 [2025-03-26 00:38:05,897] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15456.40 | forward: 145227.36 | backward_microstep: 389998.87 | backward: 389992.03 | backward_inner_microstep: 389975.89 | backward_inner: 389970.02 | backward_allreduce_microstep: 7.62 | backward_allreduce: 2.54 | reduce_tied_grads: 0.28 | comms: 17.87 | reduce_grads: 0.18 | step: 347.91 | _step_clipping: 0.11 | _step_step: 346.31 | _step_zero_grad: 0.46 | _step_check_overflow: 0.50 samples/sec: 16.400 | iteration 1450/ 143000 | elapsed time per iteration (ms): 62439.5 | learning rate: 6.000E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 3.778609E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 00:48:30,728] [INFO] [logging.py:60:log_dist] [Rank 0] step=1460, skipped=0, lr=[0.0005999999348429484, 0.0005999999348429484], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1460 loss: 3.7869 iter time (s): 62.483 samples/sec: 16.389 %comms: 0.002891503414137753 %optimizer_step 0.056192078799545006 %forward: 23.266897795821308 %backward: 62.462203407937956 [2025-03-26 00:48:30,729] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15342.05 | forward: 145377.77 | backward_microstep: 390287.81 | backward: 390280.48 | backward_inner_microstep: 390264.95 | backward_inner: 390259.04 | backward_allreduce_microstep: 7.49 | backward_allreduce: 2.59 | reduce_tied_grads: 0.30 | comms: 18.07 | reduce_grads: 0.19 | step: 351.10 | _step_clipping: 0.11 | _step_step: 349.35 | _step_zero_grad: 0.47 | _step_check_overflow: 0.59 samples/sec: 16.388 | iteration 1460/ 143000 | elapsed time per iteration (ms): 62483.2 | learning rate: 6.000E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 3.779295E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 00:58:56,611] [INFO] [logging.py:60:log_dist] [Rank 0] step=1470, skipped=0, lr=[0.000599999884165245, 0.000599999884165245], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1470 loss: 3.7488 iter time (s): 62.588 samples/sec: 16.361 %comms: 0.0028693943326135985 %optimizer_step 0.056469642372316435 %forward: 23.22479441764801 %backward: 62.35410259767673 [2025-03-26 00:58:56,612] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16359.84 | forward: 145358.65 | backward_microstep: 390267.46 | backward: 390259.99 | backward_inner_microstep: 390244.05 | backward_inner: 390238.09 | backward_allreduce_microstep: 7.72 | backward_allreduce: 2.68 | reduce_tied_grads: 0.31 | comms: 17.96 | reduce_grads: 0.19 | step: 353.43 | _step_clipping: 0.12 | _step_step: 351.75 | _step_zero_grad: 0.48 | _step_check_overflow: 0.53 samples/sec: 16.361 | iteration 1470/ 143000 | elapsed time per iteration (ms): 62588.3 | learning rate: 6.000E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 3.749337E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 01:09:21,761] [INFO] [logging.py:60:log_dist] [Rank 0] step=1480, skipped=0, lr=[0.0005999998190082019, 0.0005999998190082019], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1480 loss: 3.7290 iter time (s): 62.514 samples/sec: 16.380 %comms: 0.002859826627215455 %optimizer_step 0.05610498081601937 %forward: 23.247172535492176 %backward: 62.411156495413934 [2025-03-26 01:09:21,762] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15649.54 | forward: 145328.47 | backward_microstep: 390167.51 | backward: 390160.06 | backward_inner_microstep: 390144.46 | backward_inner: 390138.51 | backward_allreduce_microstep: 7.52 | backward_allreduce: 2.58 | reduce_tied_grads: 0.29 | comms: 17.88 | reduce_grads: 0.19 | step: 350.74 | _step_clipping: 0.12 | _step_step: 349.04 | _step_zero_grad: 0.46 | _step_check_overflow: 0.53 samples/sec: 16.380 | iteration 1480/ 143000 | elapsed time per iteration (ms): 62515.0 | learning rate: 6.000E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 3.745055E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 01:19:46,934] [INFO] [logging.py:60:log_dist] [Rank 0] step=1490, skipped=0, lr=[0.0005999997393718223, 0.0005999997393718223], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1490 loss: 3.7623 iter time (s): 62.517 samples/sec: 16.380 %comms: 0.002876540817656943 %optimizer_step 0.05625129017586536 %forward: 23.260943042469037 %backward: 62.45679840094 [2025-03-26 01:19:46,935] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15109.16 | forward: 145419.85 | backward_microstep: 390467.86 | backward: 390459.60 | backward_inner_microstep: 390443.60 | backward_inner: 390437.55 | backward_allreduce_microstep: 7.48 | backward_allreduce: 2.58 | reduce_tied_grads: 0.29 | comms: 17.98 | reduce_grads: 0.19 | step: 351.66 | _step_clipping: 0.13 | _step_step: 350.01 | _step_zero_grad: 0.45 | _step_check_overflow: 0.51 samples/sec: 16.379 | iteration 1490/ 143000 | elapsed time per iteration (ms): 62517.3 | learning rate: 6.000E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 3.746742E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 01:30:14,688] [INFO] [logging.py:60:log_dist] [Rank 0] step=1500, skipped=0, lr=[0.00059999964525611, 0.00059999964525611], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1500 loss: 3.7332 iter time (s): 62.775 samples/sec: 16.312 %comms: 0.0028675261299392412 %optimizer_step 0.056889597616162836 %forward: 23.193055257263463 %backward: 62.22839818317364 [2025-03-26 01:30:14,689] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17193.22 | forward: 145593.97 | backward_microstep: 390645.38 | backward: 390637.59 | backward_inner_microstep: 390621.76 | backward_inner: 390615.80 | backward_allreduce_microstep: 7.62 | backward_allreduce: 2.66 | reduce_tied_grads: 0.33 | comms: 18.00 | reduce_grads: 0.20 | step: 357.12 | _step_clipping: 0.12 | _step_step: 355.29 | _step_zero_grad: 0.55 | _step_check_overflow: 0.56 samples/sec: 16.312 | iteration 1500/ 143000 | elapsed time per iteration (ms): 62775.4 | learning rate: 6.000E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 3.742554E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 01:40:40,756] [INFO] [logging.py:60:log_dist] [Rank 0] step=1510, skipped=0, lr=[0.0005999995366610697, 0.0005999995366610697], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1510 loss: 3.7094 iter time (s): 62.606 samples/sec: 16.356 %comms: 0.002901488212186029 %optimizer_step 0.05567944073681041 %forward: 23.205462558091725 %backward: 62.307801749693894 [2025-03-26 01:40:40,756] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16337.79 | forward: 145280.55 | backward_microstep: 390093.97 | backward: 390085.39 | backward_inner_microstep: 390069.69 | backward_inner: 390063.84 | backward_allreduce_microstep: 7.67 | backward_allreduce: 2.78 | reduce_tied_grads: 0.25 | comms: 18.17 | reduce_grads: 0.17 | step: 348.59 | _step_clipping: 0.11 | _step_step: 346.94 | _step_zero_grad: 0.46 | _step_check_overflow: 0.54 samples/sec: 16.356 | iteration 1510/ 143000 | elapsed time per iteration (ms): 62606.7 | learning rate: 6.000E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 3.726287E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 01:51:06,086] [INFO] [logging.py:60:log_dist] [Rank 0] step=1520, skipped=0, lr=[0.0005999994135867064, 0.0005999994135867064], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1520 loss: 3.7133 iter time (s): 62.533 samples/sec: 16.375 %comms: 0.0028770350539733037 %optimizer_step 0.057377593220086835 %forward: 23.240626218105785 %backward: 62.41411300307937 [2025-03-26 01:51:06,087] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15153.21 | forward: 145329.51 | backward_microstep: 390298.75 | backward: 390291.23 | backward_inner_microstep: 390275.71 | backward_inner: 390269.62 | backward_allreduce_microstep: 7.47 | backward_allreduce: 2.58 | reduce_tied_grads: 0.29 | comms: 17.99 | reduce_grads: 0.18 | step: 358.80 | _step_clipping: 0.12 | _step_step: 357.12 | _step_zero_grad: 0.47 | _step_check_overflow: 0.53 samples/sec: 16.375 | iteration 1520/ 143000 | elapsed time per iteration (ms): 62533.1 | learning rate: 6.000E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 3.703902E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 02:01:32,311] [INFO] [logging.py:60:log_dist] [Rank 0] step=1530, skipped=0, lr=[0.0005999992760330261, 0.0005999992760330261], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1530 loss: 3.7043 iter time (s): 62.622 samples/sec: 16.352 %comms: 0.002923182203932642 %optimizer_step 0.05733151490249607 %forward: 23.22401664294765 %backward: 62.34250789545216 [2025-03-26 02:01:32,312] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15662.80 | forward: 145433.35 | backward_microstep: 390409.78 | backward: 390401.01 | backward_inner_microstep: 390385.21 | backward_inner: 390379.05 | backward_allreduce_microstep: 7.54 | backward_allreduce: 2.60 | reduce_tied_grads: 0.32 | comms: 18.31 | reduce_grads: 0.20 | step: 359.02 | _step_clipping: 0.12 | _step_step: 357.24 | _step_zero_grad: 0.47 | _step_check_overflow: 0.60 samples/sec: 16.352 | iteration 1530/ 143000 | elapsed time per iteration (ms): 62622.5 | learning rate: 6.000E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 3.706842E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 02:11:57,325] [INFO] [logging.py:60:log_dist] [Rank 0] step=1540, skipped=0, lr=[0.0005999991240000356, 0.0005999991240000356], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1540 loss: 3.7274 iter time (s): 62.501 samples/sec: 16.384 %comms: 0.002905313212213266 %optimizer_step 0.05618799154262187 %forward: 23.24249688586621 %backward: 62.42803674078421 [2025-03-26 02:11:57,325] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14726.30 | forward: 145267.43 | backward_microstep: 390187.80 | backward: 390180.13 | backward_inner_microstep: 390164.47 | backward_inner: 390158.42 | backward_allreduce_microstep: 7.50 | backward_allreduce: 2.58 | reduce_tied_grads: 0.31 | comms: 18.16 | reduce_grads: 0.20 | step: 351.18 | _step_clipping: 0.11 | _step_step: 349.51 | _step_zero_grad: 0.48 | _step_check_overflow: 0.52 samples/sec: 16.384 | iteration 1540/ 143000 | elapsed time per iteration (ms): 62501.3 | learning rate: 6.000E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 3.694256E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 02:22:21,554] [INFO] [logging.py:60:log_dist] [Rank 0] step=1550, skipped=0, lr=[0.0005999989574877421, 0.0005999989574877421], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1550 loss: 3.6692 iter time (s): 62.422 samples/sec: 16.404 %comms: 0.002899756111837758 %optimizer_step 0.05639901696736214 %forward: 23.266769001017064 %backward: 62.4849110540843 [2025-03-26 02:22:21,555] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13984.38 | forward: 145236.78 | backward_microstep: 390054.21 | backward: 390045.88 | backward_inner_microstep: 390030.08 | backward_inner: 390024.04 | backward_allreduce_microstep: 7.58 | backward_allreduce: 2.60 | reduce_tied_grads: 0.32 | comms: 18.10 | reduce_grads: 0.19 | step: 352.06 | _step_clipping: 0.12 | _step_step: 350.21 | _step_zero_grad: 0.47 | _step_check_overflow: 0.67 samples/sec: 16.404 | iteration 1550/ 143000 | elapsed time per iteration (ms): 62423.0 | learning rate: 6.000E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 3.701035E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 02:32:47,026] [INFO] [logging.py:60:log_dist] [Rank 0] step=1560, skipped=0, lr=[0.0005999987764961538, 0.0005999987764961538], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1560 loss: 3.6802 iter time (s): 62.547 samples/sec: 16.372 %comms: 0.0028822178903153304 %optimizer_step 0.0559494557236348 %forward: 23.23646983051212 %backward: 62.38700286050215 [2025-03-26 02:32:47,027] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14764.02 | forward: 145336.32 | backward_microstep: 390218.03 | backward: 390209.77 | backward_inner_microstep: 390193.76 | backward_inner: 390187.74 | backward_allreduce_microstep: 7.75 | backward_allreduce: 2.59 | reduce_tied_grads: 0.29 | comms: 18.03 | reduce_grads: 0.19 | step: 349.95 | _step_clipping: 0.11 | _step_step: 348.28 | _step_zero_grad: 0.46 | _step_check_overflow: 0.53 samples/sec: 16.372 | iteration 1560/ 143000 | elapsed time per iteration (ms): 62547.2 | learning rate: 6.000E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 3.683376E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 02:43:15,264] [INFO] [logging.py:60:log_dist] [Rank 0] step=1570, skipped=0, lr=[0.0005999985810252791, 0.0005999985810252791], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1570 loss: 3.6664 iter time (s): 62.823 samples/sec: 16.300 %comms: 0.002887977436459766 %optimizer_step 0.05588648864382551 %forward: 23.211283311221813 %backward: 62.14604026828481 [2025-03-26 02:43:15,264] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16649.57 | forward: 145820.54 | backward_microstep: 390430.00 | backward: 390420.85 | backward_inner_microstep: 390404.64 | backward_inner: 390398.40 | backward_allreduce_microstep: 7.76 | backward_allreduce: 2.65 | reduce_tied_grads: 0.32 | comms: 18.14 | reduce_grads: 0.19 | step: 351.10 | _step_clipping: 0.12 | _step_step: 349.36 | _step_zero_grad: 0.46 | _step_check_overflow: 0.55 samples/sec: 16.300 | iteration 1570/ 143000 | elapsed time per iteration (ms): 62823.7 | learning rate: 6.000E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 3.672992E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 02:53:41,058] [INFO] [logging.py:60:log_dist] [Rank 0] step=1580, skipped=0, lr=[0.0005999983710751279, 0.0005999983710751279], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1580 loss: 3.6498 iter time (s): 62.579 samples/sec: 16.363 %comms: 0.002864961446906676 %optimizer_step 0.05608821714857294 %forward: 23.21295404734652 %backward: 62.33256856207863 [2025-03-26 02:53:41,058] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15016.48 | forward: 145264.01 | backward_microstep: 390077.98 | backward: 390070.07 | backward_inner_microstep: 390054.18 | backward_inner: 390047.88 | backward_allreduce_microstep: 7.64 | backward_allreduce: 2.60 | reduce_tied_grads: 0.30 | comms: 17.93 | reduce_grads: 0.19 | step: 350.99 | _step_clipping: 0.11 | _step_step: 349.34 | _step_zero_grad: 0.48 | _step_check_overflow: 0.50 samples/sec: 16.363 | iteration 1580/ 143000 | elapsed time per iteration (ms): 62579.4 | learning rate: 6.000E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 3.660829E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 03:04:10,941] [INFO] [logging.py:60:log_dist] [Rank 0] step=1590, skipped=0, lr=[0.0005999981466457099, 0.0005999981466457099], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1590 loss: 3.6651 iter time (s): 62.987 samples/sec: 16.257 %comms: 0.002851761832711488 %optimizer_step 0.05582109924321232 %forward: 23.082020903286292 %backward: 61.91538638757835 [2025-03-26 03:04:10,941] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18899.75 | forward: 145387.24 | backward_microstep: 389996.00 | backward: 389987.82 | backward_inner_microstep: 389971.55 | backward_inner: 389965.45 | backward_allreduce_microstep: 7.79 | backward_allreduce: 2.60 | reduce_tied_grads: 0.31 | comms: 17.96 | reduce_grads: 0.19 | step: 351.60 | _step_clipping: 0.13 | _step_step: 349.89 | _step_zero_grad: 0.47 | _step_check_overflow: 0.55 samples/sec: 16.257 | iteration 1590/ 143000 | elapsed time per iteration (ms): 62988.3 | learning rate: 6.000E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 3.664817E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 03:14:35,990] [INFO] [logging.py:60:log_dist] [Rank 0] step=1600, skipped=0, lr=[0.0005999979077370362, 0.0005999979077370362], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1600 loss: 3.6678 iter time (s): 62.504 samples/sec: 16.383 %comms: 0.0028771122862147167 %optimizer_step 0.05631483751309918 %forward: 23.2236860578411 %backward: 62.39364686460813 [2025-03-26 03:14:35,990] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14144.36 | forward: 145158.10 | backward_microstep: 389995.21 | backward: 389987.32 | backward_inner_microstep: 389970.00 | backward_inner: 389963.79 | backward_allreduce_microstep: 7.45 | backward_allreduce: 2.57 | reduce_tied_grads: 0.30 | comms: 17.98 | reduce_grads: 0.18 | step: 351.99 | _step_clipping: 0.14 | _step_step: 350.24 | _step_zero_grad: 0.47 | _step_check_overflow: 0.57 samples/sec: 16.383 | iteration 1600/ 143000 | elapsed time per iteration (ms): 62504.9 | learning rate: 6.000E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 3.666308E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 03:25:11,680] [INFO] [logging.py:60:log_dist] [Rank 0] step=1610, skipped=0, lr=[0.0005999976543491182, 0.0005999976543491182], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1610 loss: 3.6164 iter time (s): 63.568 samples/sec: 16.109 %comms: 0.0031180011548332456 %optimizer_step 0.05563516525367114 %forward: 22.85203202197725 %backward: 61.3597804556926 [2025-03-26 03:25:11,681] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24466.29 | forward: 145265.37 | backward_microstep: 390059.74 | backward: 390050.71 | backward_inner_microstep: 390034.71 | backward_inner: 390028.52 | backward_allreduce_microstep: 7.67 | backward_allreduce: 2.60 | reduce_tied_grads: 0.30 | comms: 19.82 | reduce_grads: 0.20 | step: 353.66 | _step_clipping: 0.14 | _step_step: 351.83 | _step_zero_grad: 0.50 | _step_check_overflow: 0.63 samples/sec: 16.108 | iteration 1610/ 143000 | elapsed time per iteration (ms): 63569.1 | learning rate: 6.000E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 3.646093E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 03:35:42,949] [INFO] [logging.py:60:log_dist] [Rank 0] step=1620, skipped=0, lr=[0.0005999973864819681, 0.0005999973864819681], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1620 loss: 3.6248 iter time (s): 63.126 samples/sec: 16.221 %comms: 0.002839132114008602 %optimizer_step 0.05465548376645578 %forward: 23.00585969040516 %backward: 61.766426814213396 [2025-03-26 03:35:42,950] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20109.41 | forward: 145227.57 | backward_microstep: 389917.60 | backward: 389908.84 | backward_inner_microstep: 389893.13 | backward_inner: 389887.06 | backward_allreduce_microstep: 7.49 | backward_allreduce: 2.56 | reduce_tied_grads: 0.30 | comms: 17.92 | reduce_grads: 0.20 | step: 345.02 | _step_clipping: 0.13 | _step_step: 343.34 | _step_zero_grad: 0.47 | _step_check_overflow: 0.52 samples/sec: 16.221 | iteration 1620/ 143000 | elapsed time per iteration (ms): 63126.9 | learning rate: 6.000E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 3.630104E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 03:46:12,288] [INFO] [logging.py:60:log_dist] [Rank 0] step=1630, skipped=0, lr=[0.0005999971041355989, 0.0005999971041355989], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1630 loss: 3.6155 iter time (s): 62.933 samples/sec: 16.271 %comms: 0.002847577990125371 %optimizer_step 0.05600176098935724 %forward: 23.08082857569248 %backward: 61.969144288933144 [2025-03-26 03:46:12,288] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17877.57 | forward: 145255.15 | backward_microstep: 390002.04 | backward: 389991.94 | backward_inner_microstep: 389976.13 | backward_inner: 389970.03 | backward_allreduce_microstep: 7.59 | backward_allreduce: 2.61 | reduce_tied_grads: 0.29 | comms: 17.92 | reduce_grads: 0.18 | step: 352.44 | _step_clipping: 0.13 | _step_step: 350.74 | _step_zero_grad: 0.48 | _step_check_overflow: 0.53 samples/sec: 16.271 | iteration 1630/ 143000 | elapsed time per iteration (ms): 62933.8 | learning rate: 6.000E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 3.645093E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 03:56:40,555] [INFO] [logging.py:60:log_dist] [Rank 0] step=1640, skipped=0, lr=[0.0005999968073100244, 0.0005999968073100244], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1640 loss: 3.6273 iter time (s): 62.826 samples/sec: 16.299 %comms: 0.002856643014549752 %optimizer_step 0.05694774545193646 %forward: 23.12118535614828 %backward: 62.08044235460898 [2025-03-26 03:56:40,555] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16644.68 | forward: 145261.58 | backward_microstep: 390035.68 | backward: 390027.71 | backward_inner_microstep: 390012.19 | backward_inner: 390006.31 | backward_allreduce_microstep: 7.48 | backward_allreduce: 2.59 | reduce_tied_grads: 0.30 | comms: 17.95 | reduce_grads: 0.20 | step: 357.78 | _step_clipping: 0.13 | _step_step: 355.87 | _step_zero_grad: 0.50 | _step_check_overflow: 0.53 samples/sec: 16.299 | iteration 1640/ 143000 | elapsed time per iteration (ms): 62826.7 | learning rate: 6.000E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 3.626587E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 04:07:08,284] [INFO] [logging.py:60:log_dist] [Rank 0] step=1650, skipped=0, lr=[0.0005999964960052586, 0.0005999964960052586], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1650 loss: 3.6247 iter time (s): 62.772 samples/sec: 16.313 %comms: 0.0028710559808504017 %optimizer_step 0.05953790193136761 %forward: 23.151438877023995 %backward: 62.15551808616894 [2025-03-26 04:07:08,284] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15721.13 | forward: 145327.07 | backward_microstep: 390174.06 | backward: 390164.92 | backward_inner_microstep: 390149.13 | backward_inner: 390143.00 | backward_allreduce_microstep: 7.54 | backward_allreduce: 2.61 | reduce_tied_grads: 0.28 | comms: 18.02 | reduce_grads: 0.18 | step: 373.73 | _step_clipping: 0.12 | _step_step: 371.98 | _step_zero_grad: 0.49 | _step_check_overflow: 0.57 samples/sec: 16.313 | iteration 1650/ 143000 | elapsed time per iteration (ms): 62772.9 | learning rate: 6.000E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 3.624984E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 04:17:35,017] [INFO] [logging.py:60:log_dist] [Rank 0] step=1660, skipped=0, lr=[0.0005999961702213167, 0.0005999961702213167], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1660 loss: 3.6044 iter time (s): 62.673 samples/sec: 16.339 %comms: 0.0028644728296776637 %optimizer_step 0.056055075675369784 %forward: 23.178726581868972 %backward: 62.23259078776644 [2025-03-26 04:17:35,017] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14877.40 | forward: 145267.48 | backward_microstep: 390036.86 | backward: 390028.83 | backward_inner_microstep: 390013.21 | backward_inner: 390007.29 | backward_allreduce_microstep: 7.54 | backward_allreduce: 2.58 | reduce_tied_grads: 0.28 | comms: 17.95 | reduce_grads: 0.20 | step: 351.31 | _step_clipping: 0.11 | _step_step: 349.63 | _step_zero_grad: 0.48 | _step_check_overflow: 0.53 samples/sec: 16.339 | iteration 1660/ 143000 | elapsed time per iteration (ms): 62673.3 | learning rate: 6.000E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 3.610514E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 04:28:03,486] [INFO] [logging.py:60:log_dist] [Rank 0] step=1670, skipped=0, lr=[0.0005999958299582143, 0.0005999958299582143], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1670 loss: 3.6168 iter time (s): 62.846 samples/sec: 16.294 %comms: 0.0028605818355015534 %optimizer_step 0.057167212789159864 %forward: 23.114042246681368 %backward: 62.05426761074183 [2025-03-26 04:28:03,487] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16560.37 | forward: 145263.33 | backward_microstep: 389997.75 | backward: 389988.46 | backward_inner_microstep: 389970.71 | backward_inner: 389964.68 | backward_allreduce_microstep: 7.56 | backward_allreduce: 2.60 | reduce_tied_grads: 0.29 | comms: 17.98 | reduce_grads: 0.18 | step: 359.28 | _step_clipping: 0.13 | _step_step: 357.46 | _step_zero_grad: 0.53 | _step_check_overflow: 0.58 samples/sec: 16.294 | iteration 1670/ 143000 | elapsed time per iteration (ms): 62846.9 | learning rate: 6.000E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 3.610066E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 04:38:32,470] [INFO] [logging.py:60:log_dist] [Rank 0] step=1680, skipped=0, lr=[0.0005999954752159681, 0.0005999954752159681], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1680 loss: 3.6048 iter time (s): 62.898 samples/sec: 16.280 %comms: 0.002849936845479106 %optimizer_step 0.05585955819019416 %forward: 23.103618031120003 %backward: 62.00929519875365 [2025-03-26 04:38:32,471] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16882.18 | forward: 145316.88 | backward_microstep: 390033.54 | backward: 390025.37 | backward_inner_microstep: 390009.58 | backward_inner: 390003.59 | backward_allreduce_microstep: 7.57 | backward_allreduce: 2.61 | reduce_tied_grads: 0.29 | comms: 17.93 | reduce_grads: 0.19 | step: 351.34 | _step_clipping: 0.12 | _step_step: 349.68 | _step_zero_grad: 0.48 | _step_check_overflow: 0.50 samples/sec: 16.280 | iteration 1680/ 143000 | elapsed time per iteration (ms): 62898.4 | learning rate: 6.000E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 3.607843E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 04:49:03,014] [INFO] [logging.py:60:log_dist] [Rank 0] step=1690, skipped=0, lr=[0.000599995105994595, 0.000599995105994595], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1690 loss: 3.6057 iter time (s): 63.054 samples/sec: 16.240 %comms: 0.0028485617690654926 %optimizer_step 0.05737862111834989 %forward: 23.02686237610068 %backward: 61.83242917477113 [2025-03-26 04:49:03,015] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18633.93 | forward: 145193.11 | backward_microstep: 389884.07 | backward: 389876.95 | backward_inner_microstep: 389861.18 | backward_inner: 389855.42 | backward_allreduce_microstep: 7.75 | backward_allreduce: 2.60 | reduce_tied_grads: 0.31 | comms: 17.96 | reduce_grads: 0.19 | step: 361.79 | _step_clipping: 0.13 | _step_step: 359.97 | _step_zero_grad: 0.48 | _step_check_overflow: 0.66 samples/sec: 16.240 | iteration 1690/ 143000 | elapsed time per iteration (ms): 63054.4 | learning rate: 6.000E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 3.590606E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 04:59:32,295] [INFO] [logging.py:60:log_dist] [Rank 0] step=1700, skipped=0, lr=[0.0005999947222941128, 0.0005999947222941128], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1700 loss: 3.5701 iter time (s): 62.928 samples/sec: 16.273 %comms: 0.0028619668587435486 %optimizer_step 0.05621322252482264 %forward: 23.085191521649183 %backward: 61.979732328012005 [2025-03-26 04:59:32,296] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17059.09 | forward: 145269.51 | backward_microstep: 390030.66 | backward: 390023.41 | backward_inner_microstep: 390007.67 | backward_inner: 390001.85 | backward_allreduce_microstep: 7.65 | backward_allreduce: 2.66 | reduce_tied_grads: 0.29 | comms: 18.01 | reduce_grads: 0.20 | step: 353.74 | _step_clipping: 0.13 | _step_step: 351.97 | _step_zero_grad: 0.51 | _step_check_overflow: 0.55 samples/sec: 16.273 | iteration 1700/ 143000 | elapsed time per iteration (ms): 62928.1 | learning rate: 6.000E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 3.582825E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 05:10:06,158] [INFO] [logging.py:60:log_dist] [Rank 0] step=1710, skipped=0, lr=[0.0005999943241145402, 0.0005999943241145402], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1710 loss: 3.5820 iter time (s): 63.386 samples/sec: 16.155 %comms: 0.0028269882201231162 %optimizer_step 0.05516963916088852 %forward: 22.986243737954787 %backward: 61.56753865334848 [2025-03-26 05:10:06,159] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20887.95 | forward: 145699.92 | backward_microstep: 390259.32 | backward: 390250.17 | backward_inner_microstep: 390234.25 | backward_inner: 390228.20 | backward_allreduce_microstep: 7.60 | backward_allreduce: 2.63 | reduce_tied_grads: 0.29 | comms: 17.92 | reduce_grads: 0.18 | step: 349.70 | _step_clipping: 0.12 | _step_step: 347.98 | _step_zero_grad: 0.48 | _step_check_overflow: 0.57 samples/sec: 16.155 | iteration 1710/ 143000 | elapsed time per iteration (ms): 63386.3 | learning rate: 6.000E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 3.581252E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 05:20:36,624] [INFO] [logging.py:60:log_dist] [Rank 0] step=1720, skipped=0, lr=[0.0005999939114558962, 0.0005999939114558962], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1720 loss: 3.5828 iter time (s): 63.046 samples/sec: 16.242 %comms: 0.0028475897440531617 %optimizer_step 0.055524596515318665 %forward: 23.079078611308596 %backward: 61.890328478450854 [2025-03-26 05:20:36,624] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17721.96 | forward: 145504.40 | backward_microstep: 390202.94 | backward: 390193.87 | backward_inner_microstep: 390173.88 | backward_inner: 390165.77 | backward_allreduce_microstep: 7.87 | backward_allreduce: 2.71 | reduce_tied_grads: 0.29 | comms: 17.95 | reduce_grads: 0.19 | step: 350.06 | _step_clipping: 0.15 | _step_step: 348.38 | _step_zero_grad: 0.47 | _step_check_overflow: 0.51 samples/sec: 16.242 | iteration 1720/ 143000 | elapsed time per iteration (ms): 63046.6 | learning rate: 6.000E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 3.580400E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 05:31:05,299] [INFO] [logging.py:60:log_dist] [Rank 0] step=1730, skipped=0, lr=[0.0005999934843182008, 0.0005999934843182008], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1730 loss: 3.5548 iter time (s): 62.867 samples/sec: 16.288 %comms: 0.0028588501814702395 %optimizer_step 0.05600002187779066 %forward: 23.11469437020563 %backward: 62.053365541925345 [2025-03-26 05:31:05,299] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16135.07 | forward: 145314.94 | backward_microstep: 390119.47 | backward: 390110.34 | backward_inner_microstep: 390094.13 | backward_inner: 390088.00 | backward_allreduce_microstep: 7.78 | backward_allreduce: 2.68 | reduce_tied_grads: 0.31 | comms: 17.97 | reduce_grads: 0.19 | step: 352.05 | _step_clipping: 0.13 | _step_step: 350.33 | _step_zero_grad: 0.48 | _step_check_overflow: 0.55 samples/sec: 16.288 | iteration 1730/ 143000 | elapsed time per iteration (ms): 62867.5 | learning rate: 6.000E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 3.555056E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 05:41:33,914] [INFO] [logging.py:60:log_dist] [Rank 0] step=1740, skipped=0, lr=[0.0005999930427014748, 0.0005999930427014748], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1740 loss: 3.5568 iter time (s): 62.861 samples/sec: 16.290 %comms: 0.0028859000799032375 %optimizer_step 0.05715094076016922 %forward: 23.126312286579616 %backward: 62.048114604998695 [2025-03-26 05:41:33,914] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16030.56 | forward: 145374.11 | backward_microstep: 390050.59 | backward: 390040.11 | backward_inner_microstep: 390021.78 | backward_inner: 390015.55 | backward_allreduce_microstep: 9.65 | backward_allreduce: 2.71 | reduce_tied_grads: 0.30 | comms: 18.14 | reduce_grads: 0.20 | step: 359.26 | _step_clipping: 0.13 | _step_step: 357.44 | _step_zero_grad: 0.53 | _step_check_overflow: 0.56 samples/sec: 16.290 | iteration 1740/ 143000 | elapsed time per iteration (ms): 62861.5 | learning rate: 6.000E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 3.562944E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 05:52:02,781] [INFO] [logging.py:60:log_dist] [Rank 0] step=1750, skipped=0, lr=[0.0005999925866057392, 0.0005999925866057392], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1750 loss: 3.5613 iter time (s): 62.886 samples/sec: 16.283 %comms: 0.002904986977301997 %optimizer_step 0.05641592154592361 %forward: 23.11932648035217 %backward: 62.03834422894151 [2025-03-26 05:52:02,782] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16148.80 | forward: 145388.56 | backward_microstep: 390145.54 | backward: 390135.31 | backward_inner_microstep: 390119.23 | backward_inner: 390112.98 | backward_allreduce_microstep: 7.63 | backward_allreduce: 2.62 | reduce_tied_grads: 0.30 | comms: 18.27 | reduce_grads: 0.19 | step: 354.78 | _step_clipping: 0.12 | _step_step: 352.97 | _step_zero_grad: 0.51 | _step_check_overflow: 0.60 samples/sec: 16.283 | iteration 1750/ 143000 | elapsed time per iteration (ms): 62886.7 | learning rate: 6.000E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 3.562029E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 06:02:32,394] [INFO] [logging.py:60:log_dist] [Rank 0] step=1760, skipped=0, lr=[0.0005999921160310161, 0.0005999921160310161], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1760 loss: 3.5436 iter time (s): 62.961 samples/sec: 16.264 %comms: 0.002853681925586934 %optimizer_step 0.060442209344320215 %forward: 23.147198775662115 %backward: 61.95878548057633 [2025-03-26 06:02:32,395] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16518.32 | forward: 145736.42 | backward_microstep: 390107.62 | backward: 390096.95 | backward_inner_microstep: 390080.40 | backward_inner: 390074.14 | backward_allreduce_microstep: 7.91 | backward_allreduce: 2.71 | reduce_tied_grads: 0.30 | comms: 17.97 | reduce_grads: 0.19 | step: 380.55 | _step_clipping: 0.11 | _step_step: 378.67 | _step_zero_grad: 0.53 | _step_check_overflow: 0.63 samples/sec: 16.264 | iteration 1760/ 143000 | elapsed time per iteration (ms): 62961.3 | learning rate: 6.000E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 3.554285E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 06:13:02,260] [INFO] [logging.py:60:log_dist] [Rank 0] step=1770, skipped=0, lr=[0.0005999916309773285, 0.0005999916309773285], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1770 loss: 3.5437 iter time (s): 62.986 samples/sec: 16.258 %comms: 0.0028816056744242658 %optimizer_step 0.057626814123024714 %forward: 23.120670005588654 %backward: 61.93524616390054 [2025-03-26 06:13:02,261] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16847.62 | forward: 145627.93 | backward_microstep: 390116.28 | backward: 390105.55 | backward_inner_microstep: 390088.69 | backward_inner: 390082.26 | backward_allreduce_microstep: 8.05 | backward_allreduce: 2.78 | reduce_tied_grads: 0.32 | comms: 18.15 | reduce_grads: 0.20 | step: 362.97 | _step_clipping: 0.12 | _step_step: 361.18 | _step_zero_grad: 0.51 | _step_check_overflow: 0.56 samples/sec: 16.257 | iteration 1770/ 143000 | elapsed time per iteration (ms): 62986.7 | learning rate: 6.000E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 3.543198E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 06:23:30,670] [INFO] [logging.py:60:log_dist] [Rank 0] step=1780, skipped=0, lr=[0.0005999911314446994, 0.0005999911314446994], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1780 loss: 3.5455 iter time (s): 62.840 samples/sec: 16.295 %comms: 0.0028475796025817456 %optimizer_step 0.05590637167068887 %forward: 23.15198405613417 %backward: 62.10759195229932 [2025-03-26 06:23:30,670] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15342.75 | forward: 145487.70 | backward_microstep: 390294.50 | backward: 390285.80 | backward_inner_microstep: 390269.36 | backward_inner: 390261.52 | backward_allreduce_microstep: 7.84 | backward_allreduce: 2.72 | reduce_tied_grads: 0.27 | comms: 17.89 | reduce_grads: 0.18 | step: 351.32 | _step_clipping: 0.11 | _step_step: 349.47 | _step_zero_grad: 0.65 | _step_check_overflow: 0.54 samples/sec: 16.295 | iteration 1780/ 143000 | elapsed time per iteration (ms): 62840.8 | learning rate: 6.000E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 3.546813E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 06:33:57,899] [INFO] [logging.py:60:log_dist] [Rank 0] step=1790, skipped=0, lr=[0.0005999906174331531, 0.0005999906174331531], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1790 loss: 3.5224 iter time (s): 62.722 samples/sec: 16.326 %comms: 0.0028886991496866314 %optimizer_step 0.058376202358539325 %forward: 23.171335335896128 %backward: 62.22127470457364 [2025-03-26 06:33:57,900] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14302.62 | forward: 145336.22 | backward_microstep: 390276.07 | backward: 390266.89 | backward_inner_microstep: 390250.78 | backward_inner: 390244.58 | backward_allreduce_microstep: 7.73 | backward_allreduce: 2.70 | reduce_tied_grads: 0.32 | comms: 18.12 | reduce_grads: 0.19 | step: 366.15 | _step_clipping: 0.11 | _step_step: 364.20 | _step_zero_grad: 0.52 | _step_check_overflow: 0.72 samples/sec: 16.326 | iteration 1790/ 143000 | elapsed time per iteration (ms): 62723.0 | learning rate: 6.000E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 3.543221E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 06:44:32,936] [INFO] [logging.py:60:log_dist] [Rank 0] step=1800, skipped=0, lr=[0.0005999900889427145, 0.0005999900889427145], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1800 loss: 3.4922 iter time (s): 63.503 samples/sec: 16.125 %comms: 0.002852325219539977 %optimizer_step 0.058175014782113885 %forward: 22.894757473064388 %backward: 61.45502997165801 [2025-03-26 06:44:32,937] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22012.65 | forward: 145388.70 | backward_microstep: 390268.29 | backward: 390258.21 | backward_inner_microstep: 390241.60 | backward_inner: 390234.80 | backward_allreduce_microstep: 7.92 | backward_allreduce: 2.71 | reduce_tied_grads: 0.36 | comms: 18.11 | reduce_grads: 0.22 | step: 369.43 | _step_clipping: 0.13 | _step_step: 367.58 | _step_zero_grad: 0.51 | _step_check_overflow: 0.60 samples/sec: 16.125 | iteration 1800/ 143000 | elapsed time per iteration (ms): 63503.6 | learning rate: 6.000E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 3.519720E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 06:55:02,413] [INFO] [logging.py:60:log_dist] [Rank 0] step=1810, skipped=0, lr=[0.000599989545973409, 0.000599989545973409], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1810 loss: 3.5020 iter time (s): 62.947 samples/sec: 16.268 %comms: 0.0028646378156942124 %optimizer_step 0.05685502335381473 %forward: 23.08969312847303 %backward: 61.99568128805818 [2025-03-26 06:55:02,414] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16548.53 | forward: 145343.00 | backward_microstep: 390255.13 | backward: 390245.04 | backward_inner_microstep: 390228.78 | backward_inner: 390222.42 | backward_allreduce_microstep: 7.59 | backward_allreduce: 2.61 | reduce_tied_grads: 0.31 | comms: 18.03 | reduce_grads: 0.19 | step: 357.89 | _step_clipping: 0.15 | _step_step: 356.12 | _step_zero_grad: 0.50 | _step_check_overflow: 0.54 samples/sec: 16.267 | iteration 1810/ 143000 | elapsed time per iteration (ms): 62947.7 | learning rate: 6.000E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 3.516708E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 07:05:36,983] [INFO] [logging.py:60:log_dist] [Rank 0] step=1820, skipped=0, lr=[0.0005999889885252627, 0.0005999889885252627], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1820 loss: 3.5164 iter time (s): 63.456 samples/sec: 16.137 %comms: 0.0028660704507724732 %optimizer_step 0.05669470736736886 %forward: 22.912049717176536 %backward: 61.45848830704499 [2025-03-26 07:05:36,983] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21888.21 | forward: 145391.58 | backward_microstep: 390003.00 | backward: 389993.33 | backward_inner_microstep: 389977.18 | backward_inner: 389970.94 | backward_allreduce_microstep: 7.72 | backward_allreduce: 2.67 | reduce_tied_grads: 0.32 | comms: 18.19 | reduce_grads: 0.20 | step: 359.76 | _step_clipping: 0.13 | _step_step: 357.96 | _step_zero_grad: 0.49 | _step_check_overflow: 0.59 samples/sec: 16.137 | iteration 1820/ 143000 | elapsed time per iteration (ms): 63457.0 | learning rate: 6.000E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 3.518572E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 07:16:05,881] [INFO] [logging.py:60:log_dist] [Rank 0] step=1830, skipped=0, lr=[0.0005999884165983028, 0.0005999884165983028], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1830 loss: 3.5058 iter time (s): 62.889 samples/sec: 16.283 %comms: 0.0028514274948186116 %optimizer_step 0.05499220591178576 %forward: 23.092183548661733 %backward: 61.982615757107155 [2025-03-26 07:16:05,882] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16630.61 | forward: 145225.02 | backward_microstep: 389811.44 | backward: 389804.05 | backward_inner_microstep: 389788.51 | backward_inner: 389782.72 | backward_allreduce_microstep: 7.50 | backward_allreduce: 2.59 | reduce_tied_grads: 0.29 | comms: 17.93 | reduce_grads: 0.19 | step: 345.84 | _step_clipping: 0.13 | _step_step: 344.17 | _step_zero_grad: 0.48 | _step_check_overflow: 0.53 samples/sec: 16.282 | iteration 1830/ 143000 | elapsed time per iteration (ms): 62889.8 | learning rate: 6.000E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 3.515685E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 07:26:36,433] [INFO] [logging.py:60:log_dist] [Rank 0] step=1840, skipped=0, lr=[0.0005999878301925566, 0.0005999878301925566], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1840 loss: 3.5090 iter time (s): 63.055 samples/sec: 16.240 %comms: 0.0028470484802562332 %optimizer_step 0.05628834454178501 %forward: 23.03980467305431 %backward: 61.824127517813174 [2025-03-26 07:26:36,434] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18184.74 | forward: 145276.69 | backward_microstep: 389837.48 | backward: 389829.89 | backward_inner_microstep: 389814.07 | backward_inner: 389808.14 | backward_allreduce_microstep: 7.64 | backward_allreduce: 2.60 | reduce_tied_grads: 0.27 | comms: 17.95 | reduce_grads: 0.20 | step: 354.92 | _step_clipping: 0.12 | _step_step: 353.19 | _step_zero_grad: 0.52 | _step_check_overflow: 0.51 samples/sec: 16.240 | iteration 1840/ 143000 | elapsed time per iteration (ms): 63055.2 | learning rate: 6.000E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 3.515873E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 07:37:07,646] [INFO] [logging.py:60:log_dist] [Rank 0] step=1850, skipped=0, lr=[0.0005999872293080525, 0.0005999872293080525], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1850 loss: 3.4903 iter time (s): 63.121 samples/sec: 16.223 %comms: 0.0028364003175391952 %optimizer_step 0.05831578315751792 %forward: 23.0491148020222 %backward: 61.80517425072921 [2025-03-26 07:37:07,646] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18299.93 | forward: 145487.70 | backward_microstep: 390126.52 | backward: 390118.78 | backward_inner_microstep: 390102.69 | backward_inner: 390096.69 | backward_allreduce_microstep: 7.85 | backward_allreduce: 2.63 | reduce_tied_grads: 0.27 | comms: 17.90 | reduce_grads: 0.19 | step: 368.09 | _step_clipping: 0.11 | _step_step: 366.49 | _step_zero_grad: 0.47 | _step_check_overflow: 0.48 samples/sec: 16.223 | iteration 1850/ 143000 | elapsed time per iteration (ms): 63121.3 | learning rate: 6.000E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 3.503637E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 07:47:45,060] [INFO] [logging.py:60:log_dist] [Rank 0] step=1860, skipped=0, lr=[0.0005999866139448196, 0.0005999866139448196], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1860 loss: 3.4692 iter time (s): 63.741 samples/sec: 16.065 %comms: 0.002808016022002595 %optimizer_step 0.05454931700244998 %forward: 22.79609452923659 %backward: 61.194035560764725 [2025-03-26 07:47:45,061] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24797.44 | forward: 145304.46 | backward_microstep: 390063.87 | backward: 390056.57 | backward_inner_microstep: 390040.80 | backward_inner: 390034.82 | backward_allreduce_microstep: 7.60 | backward_allreduce: 2.61 | reduce_tied_grads: 0.28 | comms: 17.90 | reduce_grads: 0.18 | step: 347.70 | _step_clipping: 0.13 | _step_step: 346.01 | _step_zero_grad: 0.48 | _step_check_overflow: 0.54 samples/sec: 16.065 | iteration 1860/ 143000 | elapsed time per iteration (ms): 63741.5 | learning rate: 6.000E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 3.484535E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 07:58:19,481] [INFO] [logging.py:60:log_dist] [Rank 0] step=1870, skipped=0, lr=[0.0005999859841028875, 0.0005999859841028875], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1870 loss: 3.4992 iter time (s): 63.441 samples/sec: 16.141 %comms: 0.0028251043469882165 %optimizer_step 0.055101372740427534 %forward: 22.89249160524719 %backward: 61.50618606943865 [2025-03-26 07:58:19,482] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21702.68 | forward: 145233.34 | backward_microstep: 390213.35 | backward: 390204.31 | backward_inner_microstep: 390188.37 | backward_inner: 390182.32 | backward_allreduce_microstep: 7.65 | backward_allreduce: 2.76 | reduce_tied_grads: 0.30 | comms: 17.92 | reduce_grads: 0.18 | step: 349.57 | _step_clipping: 0.13 | _step_step: 347.86 | _step_zero_grad: 0.47 | _step_check_overflow: 0.57 samples/sec: 16.141 | iteration 1870/ 143000 | elapsed time per iteration (ms): 63442.0 | learning rate: 6.000E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 3.490890E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 08:08:51,079] [INFO] [logging.py:60:log_dist] [Rank 0] step=1880, skipped=0, lr=[0.0005999853397822867, 0.0005999853397822867], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1880 loss: 3.4893 iter time (s): 63.159 samples/sec: 16.213 %comms: 0.00286913848678836 %optimizer_step 0.055853337652369606 %forward: 23.001511950972144 %backward: 61.78284625825681 [2025-03-26 08:08:51,079] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18839.70 | forward: 145275.66 | backward_microstep: 390224.77 | backward: 390215.37 | backward_inner_microstep: 390199.33 | backward_inner: 390193.14 | backward_allreduce_microstep: 7.62 | backward_allreduce: 2.61 | reduce_tied_grads: 0.30 | comms: 18.12 | reduce_grads: 0.19 | step: 352.77 | _step_clipping: 0.13 | _step_step: 350.96 | _step_zero_grad: 0.51 | _step_check_overflow: 0.58 samples/sec: 16.213 | iteration 1880/ 143000 | elapsed time per iteration (ms): 63159.8 | learning rate: 6.000E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 3.492136E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 08:19:23,034] [INFO] [logging.py:60:log_dist] [Rank 0] step=1890, skipped=0, lr=[0.0005999846809830482, 0.0005999846809830482], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1890 loss: 3.4787 iter time (s): 63.195 samples/sec: 16.204 %comms: 0.0028810579214807526 %optimizer_step 0.05641516225966738 %forward: 22.996160918354487 %backward: 61.75856857684985 [2025-03-26 08:19:23,035] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19068.23 | forward: 145324.15 | backward_microstep: 390292.28 | backward: 390283.04 | backward_inner_microstep: 390267.14 | backward_inner: 390260.95 | backward_allreduce_microstep: 7.58 | backward_allreduce: 2.61 | reduce_tied_grads: 0.31 | comms: 18.21 | reduce_grads: 0.19 | step: 356.52 | _step_clipping: 0.13 | _step_step: 354.64 | _step_zero_grad: 0.52 | _step_check_overflow: 0.62 samples/sec: 16.204 | iteration 1890/ 143000 | elapsed time per iteration (ms): 63195.5 | learning rate: 6.000E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 3.478818E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 08:29:54,824] [INFO] [logging.py:60:log_dist] [Rank 0] step=1900, skipped=0, lr=[0.0005999840077052038, 0.0005999840077052038], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1900 loss: 3.4682 iter time (s): 63.178 samples/sec: 16.208 %comms: 0.00284845201395888 %optimizer_step 0.0566212529716347 %forward: 22.99653884569745 %backward: 61.76227348116131 [2025-03-26 08:29:54,824] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19040.36 | forward: 145288.52 | backward_microstep: 390215.54 | backward: 390204.35 | backward_inner_microstep: 390188.07 | backward_inner: 390181.81 | backward_allreduce_microstep: 7.74 | backward_allreduce: 2.65 | reduce_tied_grads: 0.31 | comms: 18.00 | reduce_grads: 0.20 | step: 357.72 | _step_clipping: 0.12 | _step_step: 355.88 | _step_zero_grad: 0.49 | _step_check_overflow: 0.64 samples/sec: 16.208 | iteration 1900/ 143000 | elapsed time per iteration (ms): 63179.0 | learning rate: 6.000E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 3.474780E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 08:40:27,012] [INFO] [logging.py:60:log_dist] [Rank 0] step=1910, skipped=0, lr=[0.000599983319948786, 0.000599983319948786], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1910 loss: 3.4644 iter time (s): 63.218 samples/sec: 16.198 %comms: 0.0028543134131316654 %optimizer_step 0.055710175722355995 %forward: 23.000298661259727 %backward: 61.74758276605659 [2025-03-26 08:40:27,013] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19162.43 | forward: 145403.88 | backward_microstep: 390368.63 | backward: 390357.45 | backward_inner_microstep: 390336.60 | backward_inner: 390330.09 | backward_allreduce_microstep: 12.17 | backward_allreduce: 2.68 | reduce_tied_grads: 0.29 | comms: 18.04 | reduce_grads: 0.19 | step: 352.19 | _step_clipping: 0.11 | _step_step: 350.32 | _step_zero_grad: 0.52 | _step_check_overflow: 0.65 samples/sec: 16.198 | iteration 1910/ 143000 | elapsed time per iteration (ms): 63218.8 | learning rate: 6.000E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 3.463579E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 08:50:58,374] [INFO] [logging.py:60:log_dist] [Rank 0] step=1920, skipped=0, lr=[0.0005999826177138281, 0.0005999826177138281], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1920 loss: 3.4531 iter time (s): 63.136 samples/sec: 16.219 %comms: 0.0028574113044475 %optimizer_step 0.056238138303823265 %forward: 23.004339509276907 %backward: 61.795825004686364 [2025-03-26 08:50:58,375] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18767.83 | forward: 145239.12 | backward_microstep: 390160.87 | backward: 390151.24 | backward_inner_microstep: 390134.82 | backward_inner: 390128.65 | backward_allreduce_microstep: 7.75 | backward_allreduce: 2.65 | reduce_tied_grads: 0.31 | comms: 18.04 | reduce_grads: 0.20 | step: 355.06 | _step_clipping: 0.12 | _step_step: 353.14 | _step_zero_grad: 0.50 | _step_check_overflow: 0.73 samples/sec: 16.219 | iteration 1920/ 143000 | elapsed time per iteration (ms): 63136.2 | learning rate: 6.000E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 3.463926E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 09:01:32,077] [INFO] [logging.py:60:log_dist] [Rank 0] step=1930, skipped=0, lr=[0.0005999819010003637, 0.0005999819010003637], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1930 loss: 3.4551 iter time (s): 63.370 samples/sec: 16.159 %comms: 0.0028357144835865797 %optimizer_step 0.05465222549033009 %forward: 22.940499986077455 %backward: 61.57866675714345 [2025-03-26 09:01:32,078] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20913.24 | forward: 145373.33 | backward_microstep: 390232.64 | backward: 390222.36 | backward_inner_microstep: 390205.89 | backward_inner: 390199.57 | backward_allreduce_microstep: 7.85 | backward_allreduce: 2.69 | reduce_tied_grads: 0.31 | comms: 17.97 | reduce_grads: 0.18 | step: 346.33 | _step_clipping: 0.13 | _step_step: 344.65 | _step_zero_grad: 0.47 | _step_check_overflow: 0.53 samples/sec: 16.159 | iteration 1930/ 143000 | elapsed time per iteration (ms): 63370.3 | learning rate: 6.000E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 3.453976E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 09:12:04,070] [INFO] [logging.py:60:log_dist] [Rank 0] step=1940, skipped=0, lr=[0.0005999811698084278, 0.0005999811698084278], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1940 loss: 3.4472 iter time (s): 63.199 samples/sec: 16.203 %comms: 0.002838106078074524 %optimizer_step 0.057456001545726026 %forward: 23.01020655808167 %backward: 61.72506678021424 [2025-03-26 09:12:04,071] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19334.69 | forward: 145421.57 | backward_microstep: 390102.83 | backward: 390094.54 | backward_inner_microstep: 390078.35 | backward_inner: 390072.21 | backward_allreduce_microstep: 7.84 | backward_allreduce: 2.68 | reduce_tied_grads: 0.30 | comms: 17.94 | reduce_grads: 0.20 | step: 363.11 | _step_clipping: 0.14 | _step_step: 361.37 | _step_zero_grad: 0.50 | _step_check_overflow: 0.54 samples/sec: 16.203 | iteration 1940/ 143000 | elapsed time per iteration (ms): 63199.3 | learning rate: 6.000E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 3.459666E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 09:22:35,757] [INFO] [logging.py:60:log_dist] [Rank 0] step=1950, skipped=0, lr=[0.0005999804241380555, 0.0005999804241380555], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1950 loss: 3.4624 iter time (s): 63.168 samples/sec: 16.211 %comms: 0.0028375590539802763 %optimizer_step 0.054649243546802104 %forward: 23.00223694842652 %backward: 61.76686005027927 [2025-03-26 09:22:35,757] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19109.99 | forward: 145300.66 | backward_microstep: 390177.22 | backward: 390169.24 | backward_inner_microstep: 390153.68 | backward_inner: 390147.71 | backward_allreduce_microstep: 7.44 | backward_allreduce: 2.60 | reduce_tied_grads: 0.30 | comms: 17.92 | reduce_grads: 0.19 | step: 345.21 | _step_clipping: 0.14 | _step_step: 343.52 | _step_zero_grad: 0.46 | _step_check_overflow: 0.54 samples/sec: 16.211 | iteration 1950/ 143000 | elapsed time per iteration (ms): 63168.6 | learning rate: 6.000E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 3.447003E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 09:33:13,289] [INFO] [logging.py:60:log_dist] [Rank 0] step=1960, skipped=0, lr=[0.0005999796639892826, 0.0005999796639892826], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1960 loss: 3.4474 iter time (s): 63.753 samples/sec: 16.062 %comms: 0.002817448125272493 %optimizer_step 0.054571911083634864 %forward: 22.786299261288946 %backward: 61.192754565763174 [2025-03-26 09:33:13,289] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25065.41 | forward: 145268.70 | backward_microstep: 390128.18 | backward: 390120.04 | backward_inner_microstep: 390104.23 | backward_inner: 390098.10 | backward_allreduce_microstep: 7.67 | backward_allreduce: 2.77 | reduce_tied_grads: 0.32 | comms: 17.96 | reduce_grads: 0.19 | step: 347.91 | _step_clipping: 0.13 | _step_step: 346.07 | _step_zero_grad: 0.47 | _step_check_overflow: 0.69 samples/sec: 16.062 | iteration 1960/ 143000 | elapsed time per iteration (ms): 63753.2 | learning rate: 6.000E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 3.448488E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 09:43:44,531] [INFO] [logging.py:60:log_dist] [Rank 0] step=1970, skipped=0, lr=[0.000599978889362146, 0.000599978889362146], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1970 loss: 3.4407 iter time (s): 63.123 samples/sec: 16.222 %comms: 0.0028851381996608007 %optimizer_step 0.05585846774585082 %forward: 23.037251013189028 %backward: 61.78482821058625 [2025-03-26 09:43:44,532] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18790.40 | forward: 145417.91 | backward_microstep: 390012.54 | backward: 390004.02 | backward_inner_microstep: 389987.40 | backward_inner: 389981.27 | backward_allreduce_microstep: 8.13 | backward_allreduce: 2.93 | reduce_tied_grads: 0.32 | comms: 18.21 | reduce_grads: 0.20 | step: 352.60 | _step_clipping: 0.14 | _step_step: 350.75 | _step_zero_grad: 0.52 | _step_check_overflow: 0.59 samples/sec: 16.222 | iteration 1970/ 143000 | elapsed time per iteration (ms): 63124.2 | learning rate: 6.000E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 3.453467E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 09:54:17,775] [INFO] [logging.py:60:log_dist] [Rank 0] step=1980, skipped=0, lr=[0.0005999781002566832, 0.0005999781002566832], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1980 loss: 3.4291 iter time (s): 63.324 samples/sec: 16.171 %comms: 0.002834191985334509 %optimizer_step 0.05486346736625049 %forward: 22.980542662456298 %backward: 61.60488927650657 [2025-03-26 09:54:17,775] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20589.75 | forward: 145521.66 | backward_microstep: 390115.72 | backward: 390105.92 | backward_inner_microstep: 390089.15 | backward_inner: 390082.81 | backward_allreduce_microstep: 8.04 | backward_allreduce: 2.79 | reduce_tied_grads: 0.28 | comms: 17.95 | reduce_grads: 0.19 | step: 347.42 | _step_clipping: 0.13 | _step_step: 345.74 | _step_zero_grad: 0.46 | _step_check_overflow: 0.51 samples/sec: 16.171 | iteration 1980/ 143000 | elapsed time per iteration (ms): 63324.4 | learning rate: 6.000E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 3.436816E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 10:04:49,264] [INFO] [logging.py:60:log_dist] [Rank 0] step=1990, skipped=0, lr=[0.000599977296672932, 0.000599977296672932], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 1990 loss: 3.4461 iter time (s): 63.148 samples/sec: 16.216 %comms: 0.0028417096127202296 %optimizer_step 0.05642352508121142 %forward: 23.047980647097464 %backward: 61.77697279880226 [2025-03-26 10:04:49,264] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18832.88 | forward: 145543.27 | backward_microstep: 390120.04 | backward: 390108.90 | backward_inner_microstep: 390092.55 | backward_inner: 390086.29 | backward_allreduce_microstep: 7.84 | backward_allreduce: 2.70 | reduce_tied_grads: 0.30 | comms: 17.94 | reduce_grads: 0.20 | step: 356.30 | _step_clipping: 0.12 | _step_step: 354.50 | _step_zero_grad: 0.54 | _step_check_overflow: 0.54 samples/sec: 16.216 | iteration 1990/ 143000 | elapsed time per iteration (ms): 63148.9 | learning rate: 6.000E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 3.427035E+00 | loss scale: 8192.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 10:15:21,652] [INFO] [logging.py:60:log_dist] [Rank 0] step=2000, skipped=0, lr=[0.0005999764786109314, 0.0005999764786109314], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2000 loss: 3.4171 iter time (s): 63.238 samples/sec: 16.193 %comms: 0.002850845180757687 %optimizer_step 0.055601321145809825 %forward: 22.99030952533907 %backward: 61.69684685638552 [2025-03-26 10:15:21,653] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19881.26 | forward: 145386.80 | backward_microstep: 390169.02 | backward: 390160.35 | backward_inner_microstep: 390144.08 | backward_inner: 390136.18 | backward_allreduce_microstep: 7.75 | backward_allreduce: 2.68 | reduce_tied_grads: 0.34 | comms: 18.03 | reduce_grads: 0.20 | step: 351.61 | _step_clipping: 0.13 | _step_step: 349.52 | _step_zero_grad: 0.49 | _step_check_overflow: 0.89 samples/sec: 16.193 | iteration 2000/ 143000 | elapsed time per iteration (ms): 63238.9 | learning rate: 6.000E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 3.430408E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 10:15:24,513] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: ../checkpoints/mydata_left-pythia160m/global_step2000/mp_rank_00_model_states.pt [2025-03-26 10:15:38,263] [INFO] [engine.py:1805:_copy_recovery_script] creating recovery script ../checkpoints/mydata_left-pythia160m/zero_to_fp32.py [2025-03-26 10:15:38,269] [INFO] [engine.py:1818:_save_zero_checkpoint] zero checkpoint saved ../checkpoints/mydata_left-pythia160m/global_step2000/zero_pp_rank_0_mp_rank_00_optim_states.pt [2025-03-26 10:26:06,436] [INFO] [logging.py:60:log_dist] [Rank 0] step=2010, skipped=0, lr=[0.0005999756460707208, 0.0005999756460707208], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2010 loss: 3.4162 iter time (s): 62.815 samples/sec: 16.302 %comms: 0.002847416826491442 %optimizer_step 0.055443972814579724 %forward: 23.13714206550958 %backward: 62.11778959211413 [2025-03-26 10:26:06,437] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15717.38 | forward: 145336.88 | backward_microstep: 390202.74 | backward: 390195.36 | backward_inner_microstep: 390179.03 | backward_inner: 390172.86 | backward_allreduce_microstep: 7.93 | backward_allreduce: 2.74 | reduce_tied_grads: 0.29 | comms: 17.89 | reduce_grads: 0.18 | step: 348.27 | _step_clipping: 0.12 | _step_step: 346.62 | _step_zero_grad: 0.45 | _step_check_overflow: 0.53 samples/sec: 15.881 | iteration 2010/ 143000 | elapsed time per iteration (ms): 64478.4 | learning rate: 6.000E-04 | approx flops per GPU: 68.5TFLOPS | lm_loss: 3.427591E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 10:36:40,976] [INFO] [logging.py:60:log_dist] [Rank 0] step=2020, skipped=0, lr=[0.0005999747990523403, 0.0005999747990523403], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2020 loss: 3.3965 iter time (s): 63.453 samples/sec: 16.138 %comms: 0.002934098609955895 %optimizer_step 0.05628408894936229 %forward: 22.979792041623206 %backward: 61.59675999223666 [2025-03-26 10:36:40,977] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20892.19 | forward: 145814.71 | backward_microstep: 390867.85 | backward: 390852.70 | backward_inner_microstep: 390836.01 | backward_inner: 390829.56 | backward_allreduce_microstep: 7.77 | backward_allreduce: 2.66 | reduce_tied_grads: 0.32 | comms: 18.62 | reduce_grads: 0.19 | step: 357.14 | _step_clipping: 0.15 | _step_step: 355.17 | _step_zero_grad: 0.52 | _step_check_overflow: 0.70 samples/sec: 16.138 | iteration 2020/ 143000 | elapsed time per iteration (ms): 63454.0 | learning rate: 6.000E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 3.415467E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 10:47:17,414] [INFO] [logging.py:60:log_dist] [Rank 0] step=2030, skipped=0, lr=[0.000599973937555831, 0.000599973937555831], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2030 loss: 3.3965 iter time (s): 63.643 samples/sec: 16.090 %comms: 0.003158563039478237 %optimizer_step 0.05877956423049896 %forward: 22.832628584059332 %backward: 61.302361508755965 [2025-03-26 10:47:17,415] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24101.40 | forward: 145313.56 | backward_microstep: 390156.37 | backward: 390146.26 | backward_inner_microstep: 390129.87 | backward_inner: 390123.48 | backward_allreduce_microstep: 7.77 | backward_allreduce: 2.69 | reduce_tied_grads: 0.38 | comms: 20.10 | reduce_grads: 0.19 | step: 374.09 | _step_clipping: 0.15 | _step_step: 372.01 | _step_zero_grad: 0.60 | _step_check_overflow: 0.68 samples/sec: 16.090 | iteration 2030/ 143000 | elapsed time per iteration (ms): 63643.7 | learning rate: 6.000E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 3.394061E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 10:57:55,020] [INFO] [logging.py:60:log_dist] [Rank 0] step=2040, skipped=0, lr=[0.0005999730615812343, 0.0005999730615812343], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2040 loss: 3.4006 iter time (s): 63.760 samples/sec: 16.060 %comms: 0.002835891629288133 %optimizer_step 0.05500952945965469 %forward: 22.821776898573262 %backward: 61.20170219758305 [2025-03-26 10:57:55,021] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25021.09 | forward: 145511.81 | backward_microstep: 390232.21 | backward: 390222.49 | backward_inner_microstep: 390205.93 | backward_inner: 390199.62 | backward_allreduce_microstep: 7.90 | backward_allreduce: 2.72 | reduce_tied_grads: 0.32 | comms: 18.08 | reduce_grads: 0.20 | step: 350.74 | _step_clipping: 0.13 | _step_step: 348.96 | _step_zero_grad: 0.48 | _step_check_overflow: 0.60 samples/sec: 16.060 | iteration 2040/ 143000 | elapsed time per iteration (ms): 63760.6 | learning rate: 6.000E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 3.399080E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 11:08:33,726] [INFO] [logging.py:60:log_dist] [Rank 0] step=2050, skipped=0, lr=[0.0005999721711285925, 0.0005999721711285925], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2050 loss: 3.3950 iter time (s): 63.870 samples/sec: 16.033 %comms: 0.0028320948908516926 %optimizer_step 0.05667321663780945 %forward: 22.82790278270119 %backward: 61.10973264749491 [2025-03-26 11:08:33,726] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25717.93 | forward: 145801.73 | backward_microstep: 390318.98 | backward: 390307.63 | backward_inner_microstep: 390291.15 | backward_inner: 390284.82 | backward_allreduce_microstep: 7.75 | backward_allreduce: 2.66 | reduce_tied_grads: 0.35 | comms: 18.09 | reduce_grads: 0.20 | step: 361.97 | _step_clipping: 0.17 | _step_step: 360.04 | _step_zero_grad: 0.50 | _step_check_overflow: 0.60 samples/sec: 16.032 | iteration 2050/ 143000 | elapsed time per iteration (ms): 63870.5 | learning rate: 6.000E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 3.388056E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 11:19:04,356] [INFO] [logging.py:60:log_dist] [Rank 0] step=2060, skipped=0, lr=[0.0005999712661979486, 0.0005999712661979486], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2060 loss: 3.3620 iter time (s): 63.062 samples/sec: 16.238 %comms: 0.0029380742206530533 %optimizer_step 0.05618979518941429 %forward: 23.093385905937094 %backward: 61.861449158401015 [2025-03-26 11:19:04,357] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18096.44 | forward: 145632.61 | backward_microstep: 390127.05 | backward: 390113.61 | backward_inner_microstep: 390097.46 | backward_inner: 390091.19 | backward_allreduce_microstep: 7.60 | backward_allreduce: 2.61 | reduce_tied_grads: 0.30 | comms: 18.53 | reduce_grads: 0.19 | step: 354.35 | _step_clipping: 0.16 | _step_step: 352.44 | _step_zero_grad: 0.55 | _step_check_overflow: 0.59 samples/sec: 16.238 | iteration 2060/ 143000 | elapsed time per iteration (ms): 63063.1 | learning rate: 6.000E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 3.373606E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 11:29:35,660] [INFO] [logging.py:60:log_dist] [Rank 0] step=2070, skipped=0, lr=[0.0005999703467893463, 0.0005999703467893463], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2070 loss: 3.3605 iter time (s): 63.129 samples/sec: 16.221 %comms: 0.0028443294027409873 %optimizer_step 0.05618892235226063 %forward: 23.0059241505326 %backward: 61.73683555256804 [2025-03-26 11:29:35,661] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19640.82 | forward: 145234.51 | backward_microstep: 389747.34 | backward: 389739.58 | backward_inner_microstep: 389723.62 | backward_inner: 389717.66 | backward_allreduce_microstep: 7.70 | backward_allreduce: 2.64 | reduce_tied_grads: 0.30 | comms: 17.96 | reduce_grads: 0.19 | step: 354.72 | _step_clipping: 0.14 | _step_step: 352.97 | _step_zero_grad: 0.49 | _step_check_overflow: 0.56 samples/sec: 16.220 | iteration 2070/ 143000 | elapsed time per iteration (ms): 63130.4 | learning rate: 6.000E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 3.370146E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 11:40:13,319] [INFO] [logging.py:60:log_dist] [Rank 0] step=2080, skipped=0, lr=[0.00059996941290283, 0.00059996941290283], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2080 loss: 3.3770 iter time (s): 63.765 samples/sec: 16.059 %comms: 0.0028328508117027836 %optimizer_step 0.05467370285114849 %forward: 22.817004809297153 %backward: 61.18919510820279 [2025-03-26 11:40:13,320] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25298.54 | forward: 145493.52 | backward_microstep: 390184.37 | backward: 390175.29 | backward_inner_microstep: 390159.11 | backward_inner: 390152.94 | backward_allreduce_microstep: 7.71 | backward_allreduce: 2.68 | reduce_tied_grads: 0.31 | comms: 18.06 | reduce_grads: 0.20 | step: 348.63 | _step_clipping: 0.13 | _step_step: 346.80 | _step_zero_grad: 0.50 | _step_check_overflow: 0.61 samples/sec: 16.059 | iteration 2080/ 143000 | elapsed time per iteration (ms): 63765.9 | learning rate: 6.000E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 3.371533E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 11:50:52,278] [INFO] [logging.py:60:log_dist] [Rank 0] step=2090, skipped=0, lr=[0.0005999684645384447, 0.0005999684645384447], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2090 loss: 3.3641 iter time (s): 63.895 samples/sec: 16.026 %comms: 0.0028117197023528368 %optimizer_step 0.054442801014413456 %forward: 22.74068950511981 %backward: 61.020836441151836 [2025-03-26 11:50:52,278] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27157.42 | forward: 145302.22 | backward_microstep: 389902.98 | backward: 389894.20 | backward_inner_microstep: 389878.32 | backward_inner: 389872.20 | backward_allreduce_microstep: 7.57 | backward_allreduce: 2.60 | reduce_tied_grads: 0.31 | comms: 17.97 | reduce_grads: 0.19 | step: 347.86 | _step_clipping: 0.14 | _step_step: 346.10 | _step_zero_grad: 0.48 | _step_check_overflow: 0.58 samples/sec: 16.026 | iteration 2090/ 143000 | elapsed time per iteration (ms): 63895.8 | learning rate: 6.000E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 3.372324E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 12:01:25,638] [INFO] [logging.py:60:log_dist] [Rank 0] step=2100, skipped=0, lr=[0.0005999675016962362, 0.0005999675016962362], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2100 loss: 3.3568 iter time (s): 63.335 samples/sec: 16.168 %comms: 0.002876437449568 %optimizer_step 0.05575647925459533 %forward: 22.94305008228515 %backward: 61.56649509637545 [2025-03-26 12:01:25,638] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21519.53 | forward: 145310.80 | backward_microstep: 389942.38 | backward: 389934.05 | backward_inner_microstep: 389917.82 | backward_inner: 389911.55 | backward_allreduce_microstep: 7.91 | backward_allreduce: 2.85 | reduce_tied_grads: 0.32 | comms: 18.22 | reduce_grads: 0.19 | step: 353.14 | _step_clipping: 0.12 | _step_step: 351.32 | _step_zero_grad: 0.49 | _step_check_overflow: 0.62 samples/sec: 16.168 | iteration 2100/ 143000 | elapsed time per iteration (ms): 63336.0 | learning rate: 6.000E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 3.363496E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 12:11:55,692] [INFO] [logging.py:60:log_dist] [Rank 0] step=2110, skipped=0, lr=[0.0005999665243762509, 0.0005999665243762509], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2110 loss: 3.3607 iter time (s): 63.005 samples/sec: 16.253 %comms: 0.002846425059673143 %optimizer_step 0.05571878946376821 %forward: 23.051557650165496 %backward: 61.88012152806116 [2025-03-26 12:11:55,692] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18434.90 | forward: 145235.89 | backward_microstep: 389882.71 | backward: 389874.50 | backward_inner_microstep: 389858.22 | backward_inner: 389852.13 | backward_allreduce_microstep: 7.93 | backward_allreduce: 2.87 | reduce_tied_grads: 0.31 | comms: 17.93 | reduce_grads: 0.18 | step: 351.06 | _step_clipping: 0.13 | _step_step: 349.26 | _step_zero_grad: 0.50 | _step_check_overflow: 0.59 samples/sec: 16.253 | iteration 2110/ 143000 | elapsed time per iteration (ms): 63005.4 | learning rate: 6.000E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 3.357959E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 12:22:25,579] [INFO] [logging.py:60:log_dist] [Rank 0] step=2120, skipped=0, lr=[0.000599965532578536, 0.000599965532578536], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2120 loss: 3.3457 iter time (s): 62.988 samples/sec: 16.257 %comms: 0.0028801818385384586 %optimizer_step 0.055217868736244145 %forward: 23.049211501450287 %backward: 61.903687466504124 [2025-03-26 12:22:25,580] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18311.39 | forward: 145182.83 | backward_microstep: 389930.72 | backward: 389920.18 | backward_inner_microstep: 389904.15 | backward_inner: 389897.82 | backward_allreduce_microstep: 7.57 | backward_allreduce: 2.60 | reduce_tied_grads: 0.32 | comms: 18.14 | reduce_grads: 0.19 | step: 347.81 | _step_clipping: 0.13 | _step_step: 346.03 | _step_zero_grad: 0.53 | _step_check_overflow: 0.54 samples/sec: 16.257 | iteration 2120/ 143000 | elapsed time per iteration (ms): 62988.8 | learning rate: 6.000E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 3.361982E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 12:32:55,877] [INFO] [logging.py:60:log_dist] [Rank 0] step=2130, skipped=0, lr=[0.0005999645263031396, 0.0005999645263031396], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2130 loss: 3.3239 iter time (s): 63.029 samples/sec: 16.246 %comms: 0.0028614781130174042 %optimizer_step 0.05503058137354437 %forward: 23.031274328847925 %backward: 61.8570691852768 [2025-03-26 12:32:55,877] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18805.84 | forward: 145164.14 | backward_microstep: 389890.07 | backward: 389879.78 | backward_inner_microstep: 389863.66 | backward_inner: 389857.46 | backward_allreduce_microstep: 7.58 | backward_allreduce: 2.62 | reduce_tied_grads: 0.32 | comms: 18.04 | reduce_grads: 0.20 | step: 346.85 | _step_clipping: 0.14 | _step_step: 345.13 | _step_zero_grad: 0.47 | _step_check_overflow: 0.54 samples/sec: 16.246 | iteration 2130/ 143000 | elapsed time per iteration (ms): 63029.7 | learning rate: 6.000E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 3.343309E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 12:43:28,883] [INFO] [logging.py:60:log_dist] [Rank 0] step=2140, skipped=0, lr=[0.00059996350555011, 0.00059996350555011], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2140 loss: 3.3555 iter time (s): 63.300 samples/sec: 16.177 %comms: 0.002854390952174279 %optimizer_step 0.05549670625736121 %forward: 22.951677532933598 %backward: 61.58988837805575 [2025-03-26 12:43:28,884] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21493.63 | forward: 145284.27 | backward_microstep: 389873.18 | backward: 389864.40 | backward_inner_microstep: 389848.01 | backward_inner: 389841.74 | backward_allreduce_microstep: 7.86 | backward_allreduce: 2.72 | reduce_tied_grads: 0.33 | comms: 18.07 | reduce_grads: 0.19 | step: 351.29 | _step_clipping: 0.15 | _step_step: 349.32 | _step_zero_grad: 0.51 | _step_check_overflow: 0.74 samples/sec: 16.177 | iteration 2140/ 143000 | elapsed time per iteration (ms): 63300.7 | learning rate: 6.000E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 3.345115E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 12:53:59,087] [INFO] [logging.py:60:log_dist] [Rank 0] step=2150, skipped=0, lr=[0.0005999624703194966, 0.0005999624703194966], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2150 loss: 3.3642 iter time (s): 63.020 samples/sec: 16.249 %comms: 0.002857365849813438 %optimizer_step 0.057834769097596245 %forward: 23.066907674235466 %backward: 61.8716604657221 [2025-03-26 12:53:59,087] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18552.10 | forward: 145367.01 | backward_microstep: 389923.01 | backward: 389913.48 | backward_inner_microstep: 389897.42 | backward_inner: 389891.17 | backward_allreduce_microstep: 7.62 | backward_allreduce: 2.62 | reduce_tied_grads: 0.31 | comms: 18.01 | reduce_grads: 0.21 | step: 364.47 | _step_clipping: 0.14 | _step_step: 362.70 | _step_zero_grad: 0.49 | _step_check_overflow: 0.57 samples/sec: 16.249 | iteration 2150/ 143000 | elapsed time per iteration (ms): 63020.3 | learning rate: 6.000E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 3.348708E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 13:04:27,238] [INFO] [logging.py:60:log_dist] [Rank 0] step=2160, skipped=0, lr=[0.0005999614206113492, 0.0005999614206113492], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2160 loss: 3.3176 iter time (s): 62.814 samples/sec: 16.302 %comms: 0.0028720164046364435 %optimizer_step 0.05735170084036224 %forward: 23.132588246844083 %backward: 62.07855738371938 [2025-03-26 13:04:27,238] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16587.12 | forward: 145306.13 | backward_microstep: 389954.67 | backward: 389943.16 | backward_inner_microstep: 389926.70 | backward_inner: 389920.20 | backward_allreduce_microstep: 7.64 | backward_allreduce: 2.63 | reduce_tied_grads: 0.35 | comms: 18.04 | reduce_grads: 0.20 | step: 360.25 | _step_clipping: 0.12 | _step_step: 358.20 | _step_zero_grad: 0.72 | _step_check_overflow: 0.60 samples/sec: 16.302 | iteration 2160/ 143000 | elapsed time per iteration (ms): 62815.1 | learning rate: 6.000E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 3.337144E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 13:14:59,137] [INFO] [logging.py:60:log_dist] [Rank 0] step=2170, skipped=0, lr=[0.0005999603564257187, 0.0005999603564257187], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2170 loss: 3.3308 iter time (s): 63.189 samples/sec: 16.205 %comms: 0.0028569783173128254 %optimizer_step 0.05653379789823146 %forward: 23.007510165540683 %backward: 61.71455390005421 [2025-03-26 13:14:59,137] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20275.55 | forward: 145382.92 | backward_microstep: 389979.57 | backward: 389970.14 | backward_inner_microstep: 389951.74 | backward_inner: 389945.28 | backward_allreduce_microstep: 8.13 | backward_allreduce: 2.81 | reduce_tied_grads: 0.33 | comms: 18.05 | reduce_grads: 0.21 | step: 357.23 | _step_clipping: 0.13 | _step_step: 355.51 | _step_zero_grad: 0.48 | _step_check_overflow: 0.52 samples/sec: 16.205 | iteration 2170/ 143000 | elapsed time per iteration (ms): 63189.9 | learning rate: 6.000E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 3.331150E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 13:25:31,805] [INFO] [logging.py:60:log_dist] [Rank 0] step=2180, skipped=0, lr=[0.0005999592777626561, 0.0005999592777626561], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2180 loss: 3.3541 iter time (s): 63.266 samples/sec: 16.186 %comms: 0.002877810962719025 %optimizer_step 0.05764297000192426 %forward: 23.092678553121697 %backward: 61.680874932344544 [2025-03-26 13:25:31,805] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20078.18 | forward: 146098.75 | backward_microstep: 390242.43 | backward: 390231.85 | backward_inner_microstep: 390215.27 | backward_inner: 390208.91 | backward_allreduce_microstep: 7.88 | backward_allreduce: 2.73 | reduce_tied_grads: 0.33 | comms: 18.21 | reduce_grads: 0.23 | step: 364.69 | _step_clipping: 0.13 | _step_step: 362.81 | _step_zero_grad: 0.51 | _step_check_overflow: 0.60 samples/sec: 16.185 | iteration 2180/ 143000 | elapsed time per iteration (ms): 63266.8 | learning rate: 6.000E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 3.331690E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 13:36:02,937] [INFO] [logging.py:60:log_dist] [Rank 0] step=2190, skipped=0, lr=[0.0005999581846222138, 0.0005999581846222138], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2190 loss: 3.3263 iter time (s): 63.113 samples/sec: 16.225 %comms: 0.0028901789491617828 %optimizer_step 0.057603324907013774 %forward: 23.077018404872383 %backward: 61.812833128685895 [2025-03-26 13:36:02,938] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19201.70 | forward: 145645.22 | backward_microstep: 390128.18 | backward: 390117.28 | backward_inner_microstep: 390100.89 | backward_inner: 390094.31 | backward_allreduce_microstep: 7.79 | backward_allreduce: 2.71 | reduce_tied_grads: 0.32 | comms: 18.24 | reduce_grads: 0.21 | step: 363.55 | _step_clipping: 0.14 | _step_step: 361.60 | _step_zero_grad: 0.53 | _step_check_overflow: 0.66 samples/sec: 16.225 | iteration 2190/ 143000 | elapsed time per iteration (ms): 63113.2 | learning rate: 6.000E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 3.320713E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 13:46:32,280] [INFO] [logging.py:60:log_dist] [Rank 0] step=2200, skipped=0, lr=[0.0005999570770044446, 0.0005999570770044446], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2200 loss: 3.3098 iter time (s): 62.934 samples/sec: 16.271 %comms: 0.0028533143365443137 %optimizer_step 0.05611929868740873 %forward: 23.09344772846864 %backward: 61.962638611079754 [2025-03-26 13:46:32,281] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17953.12 | forward: 145335.69 | backward_microstep: 389963.71 | backward: 389954.01 | backward_inner_microstep: 389937.82 | backward_inner: 389931.61 | backward_allreduce_microstep: 7.73 | backward_allreduce: 2.65 | reduce_tied_grads: 0.30 | comms: 17.96 | reduce_grads: 0.20 | step: 353.18 | _step_clipping: 0.11 | _step_step: 351.30 | _step_zero_grad: 0.48 | _step_check_overflow: 0.53 samples/sec: 16.271 | iteration 2200/ 143000 | elapsed time per iteration (ms): 62934.3 | learning rate: 6.000E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 3.316905E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 13:57:01,116] [INFO] [logging.py:60:log_dist] [Rank 0] step=2210, skipped=0, lr=[0.0005999559549094018, 0.0005999559549094018], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2210 loss: 3.3339 iter time (s): 62.883 samples/sec: 16.284 %comms: 0.002869644504001316 %optimizer_step 0.0560944277902205 %forward: 23.09863397619252 %backward: 61.9924270928173 [2025-03-26 13:57:01,116] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17722.44 | forward: 145251.15 | backward_microstep: 389836.03 | backward: 389827.01 | backward_inner_microstep: 389810.89 | backward_inner: 389804.78 | backward_allreduce_microstep: 7.73 | backward_allreduce: 2.64 | reduce_tied_grads: 0.28 | comms: 18.05 | reduce_grads: 0.19 | step: 352.74 | _step_clipping: 0.11 | _step_step: 351.06 | _step_zero_grad: 0.48 | _step_check_overflow: 0.53 samples/sec: 16.284 | iteration 2210/ 143000 | elapsed time per iteration (ms): 62883.5 | learning rate: 6.000E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 3.325393E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 14:07:30,314] [INFO] [logging.py:60:log_dist] [Rank 0] step=2220, skipped=0, lr=[0.0005999548183371395, 0.0005999548183371395], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2220 loss: 3.2862 iter time (s): 62.919 samples/sec: 16.275 %comms: 0.002859650522112345 %optimizer_step 0.05664015553722061 %forward: 23.098468683992408 %backward: 61.97204250259083 [2025-03-26 14:07:30,315] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17968.12 | forward: 145334.06 | backward_microstep: 389934.54 | backward: 389924.06 | backward_inner_microstep: 389907.56 | backward_inner: 389901.18 | backward_allreduce_microstep: 7.86 | backward_allreduce: 2.71 | reduce_tied_grads: 0.32 | comms: 17.99 | reduce_grads: 0.20 | step: 356.38 | _step_clipping: 0.13 | _step_step: 354.66 | _step_zero_grad: 0.47 | _step_check_overflow: 0.55 samples/sec: 16.275 | iteration 2220/ 143000 | elapsed time per iteration (ms): 62919.9 | learning rate: 6.000E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 3.309917E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 14:17:59,195] [INFO] [logging.py:60:log_dist] [Rank 0] step=2230, skipped=0, lr=[0.0005999536672877126, 0.0005999536672877126], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2230 loss: 3.3083 iter time (s): 62.888 samples/sec: 16.283 %comms: 0.0028717894300551022 %optimizer_step 0.056349081552519234 %forward: 23.105264094737993 %backward: 62.01072249644197 [2025-03-26 14:17:59,196] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17682.29 | forward: 145303.26 | backward_microstep: 389978.77 | backward: 389970.00 | backward_inner_microstep: 389953.51 | backward_inner: 389947.32 | backward_allreduce_microstep: 8.06 | backward_allreduce: 2.70 | reduce_tied_grads: 0.30 | comms: 18.06 | reduce_grads: 0.20 | step: 354.37 | _step_clipping: 0.11 | _step_step: 352.48 | _step_zero_grad: 0.52 | _step_check_overflow: 0.67 samples/sec: 16.283 | iteration 2230/ 143000 | elapsed time per iteration (ms): 62888.1 | learning rate: 6.000E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 3.308893E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 14:28:30,720] [INFO] [logging.py:60:log_dist] [Rank 0] step=2240, skipped=0, lr=[0.0005999525017611766, 0.0005999525017611766], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2240 loss: 3.2990 iter time (s): 63.152 samples/sec: 16.215 %comms: 0.0028477982309251023 %optimizer_step 0.05778861110783089 %forward: 23.106671395816836 %backward: 61.7786738489463 [2025-03-26 14:28:30,720] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19522.91 | forward: 145923.05 | backward_microstep: 390154.57 | backward: 390144.14 | backward_inner_microstep: 390125.96 | backward_inner: 390119.62 | backward_allreduce_microstep: 9.54 | backward_allreduce: 2.69 | reduce_tied_grads: 0.31 | comms: 17.98 | reduce_grads: 0.20 | step: 364.95 | _step_clipping: 0.13 | _step_step: 363.24 | _step_zero_grad: 0.47 | _step_check_overflow: 0.54 samples/sec: 16.215 | iteration 2240/ 143000 | elapsed time per iteration (ms): 63152.5 | learning rate: 6.000E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 3.308979E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 14:39:00,988] [INFO] [logging.py:60:log_dist] [Rank 0] step=2250, skipped=0, lr=[0.000599951321757588, 0.000599951321757588], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2250 loss: 3.2985 iter time (s): 63.026 samples/sec: 16.247 %comms: 0.002863994542433131 %optimizer_step 0.058088701670373916 %forward: 23.113637087467215 %backward: 61.88100770802879 [2025-03-26 14:39:00,989] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18720.65 | forward: 145676.48 | backward_microstep: 390023.15 | backward: 390012.51 | backward_inner_microstep: 389995.68 | backward_inner: 389989.21 | backward_allreduce_microstep: 8.00 | backward_allreduce: 2.75 | reduce_tied_grads: 0.30 | comms: 18.05 | reduce_grads: 0.20 | step: 366.11 | _step_clipping: 0.11 | _step_step: 364.29 | _step_zero_grad: 0.55 | _step_check_overflow: 0.56 samples/sec: 16.247 | iteration 2250/ 143000 | elapsed time per iteration (ms): 63026.9 | learning rate: 6.000E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 3.294440E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 14:49:33,050] [INFO] [logging.py:60:log_dist] [Rank 0] step=2260, skipped=0, lr=[0.0005999501272770035, 0.0005999501272770035], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2260 loss: 3.3640 iter time (s): 63.206 samples/sec: 16.201 %comms: 0.0028961169394661375 %optimizer_step 0.05848577964693382 %forward: 22.995178269139675 %backward: 61.71629539707012 [2025-03-26 14:49:33,050] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20831.33 | forward: 145342.26 | backward_microstep: 390091.76 | backward: 390081.17 | backward_inner_microstep: 390064.44 | backward_inner: 390058.00 | backward_allreduce_microstep: 8.02 | backward_allreduce: 2.73 | reduce_tied_grads: 0.36 | comms: 18.31 | reduce_grads: 0.23 | step: 369.66 | _step_clipping: 0.15 | _step_step: 367.66 | _step_zero_grad: 0.55 | _step_check_overflow: 0.61 samples/sec: 16.201 | iteration 2260/ 143000 | elapsed time per iteration (ms): 63206.2 | learning rate: 6.000E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 3.329238E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 15:00:03,080] [INFO] [logging.py:60:log_dist] [Rank 0] step=2270, skipped=0, lr=[0.0005999489183194809, 0.0005999489183194809], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2270 loss: 3.3182 iter time (s): 63.002 samples/sec: 16.253 %comms: 0.0028512982401452426 %optimizer_step 0.054741096526639506 %forward: 23.0544460534214 %backward: 61.918809725750215 [2025-03-26 15:00:03,081] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18380.68 | forward: 145248.73 | backward_microstep: 390114.24 | backward: 390103.86 | backward_inner_microstep: 390087.73 | backward_inner: 390081.49 | backward_allreduce_microstep: 7.65 | backward_allreduce: 2.62 | reduce_tied_grads: 0.30 | comms: 17.96 | reduce_grads: 0.19 | step: 344.88 | _step_clipping: 0.14 | _step_step: 343.16 | _step_zero_grad: 0.46 | _step_check_overflow: 0.56 samples/sec: 16.253 | iteration 2270/ 143000 | elapsed time per iteration (ms): 63003.0 | learning rate: 5.999E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 3.339742E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 15:10:30,520] [INFO] [logging.py:60:log_dist] [Rank 0] step=2280, skipped=0, lr=[0.0005999476948850784, 0.0005999476948850784], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2280 loss: 3.2938 iter time (s): 62.743 samples/sec: 16.320 %comms: 0.002864783005538594 %optimizer_step 0.055781285754154655 %forward: 23.14480028202953 %backward: 62.12234593150087 [2025-03-26 15:10:30,520] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16808.19 | forward: 145218.29 | backward_microstep: 389785.13 | backward: 389776.56 | backward_inner_microstep: 389760.14 | backward_inner: 389754.02 | backward_allreduce_microstep: 7.94 | backward_allreduce: 2.74 | reduce_tied_grads: 0.30 | comms: 17.97 | reduce_grads: 0.20 | step: 349.99 | _step_clipping: 0.13 | _step_step: 348.25 | _step_zero_grad: 0.51 | _step_check_overflow: 0.54 samples/sec: 16.320 | iteration 2280/ 143000 | elapsed time per iteration (ms): 62743.9 | learning rate: 5.999E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 3.293483E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 15:20:57,119] [INFO] [logging.py:60:log_dist] [Rank 0] step=2290, skipped=0, lr=[0.0005999464569738552, 0.0005999464569738552], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2290 loss: 3.2797 iter time (s): 62.659 samples/sec: 16.342 %comms: 0.002860559246109315 %optimizer_step 0.05531591555110256 %forward: 23.17665129773924 %backward: 62.2116409150172 [2025-03-26 15:20:57,119] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15939.77 | forward: 145223.33 | backward_microstep: 389823.35 | backward: 389813.93 | backward_inner_microstep: 389797.55 | backward_inner: 389789.54 | backward_allreduce_microstep: 7.87 | backward_allreduce: 2.73 | reduce_tied_grads: 0.30 | comms: 17.92 | reduce_grads: 0.20 | step: 346.61 | _step_clipping: 0.11 | _step_step: 344.93 | _step_zero_grad: 0.49 | _step_check_overflow: 0.51 samples/sec: 16.342 | iteration 2290/ 143000 | elapsed time per iteration (ms): 62659.9 | learning rate: 5.999E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 3.289840E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 15:31:27,019] [INFO] [logging.py:60:log_dist] [Rank 0] step=2300, skipped=0, lr=[0.000599945204585871, 0.000599945204585871], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2300 loss: 3.2786 iter time (s): 62.989 samples/sec: 16.257 %comms: 0.0028566954981011907 %optimizer_step 0.05628904755807649 %forward: 23.041637714077538 %backward: 61.87240184939241 [2025-03-26 15:31:27,019] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19508.51 | forward: 145137.99 | backward_microstep: 389739.71 | backward: 389730.80 | backward_inner_microstep: 389713.86 | backward_inner: 389707.51 | backward_allreduce_microstep: 8.17 | backward_allreduce: 2.80 | reduce_tied_grads: 0.31 | comms: 17.99 | reduce_grads: 0.20 | step: 354.56 | _step_clipping: 0.12 | _step_step: 352.91 | _step_zero_grad: 0.48 | _step_check_overflow: 0.47 samples/sec: 16.257 | iteration 2300/ 143000 | elapsed time per iteration (ms): 62990.0 | learning rate: 5.999E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 3.283464E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 15:41:58,199] [INFO] [logging.py:60:log_dist] [Rank 0] step=2310, skipped=0, lr=[0.0005999439377211862, 0.0005999439377211862], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2310 loss: 3.2883 iter time (s): 63.118 samples/sec: 16.224 %comms: 0.0033487195384470573 %optimizer_step 0.05718294062997669 %forward: 23.01677079026794 %backward: 61.74625829470827 [2025-03-26 15:41:58,200] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20713.38 | forward: 145276.13 | backward_microstep: 389735.30 | backward: 389727.01 | backward_inner_microstep: 389710.66 | backward_inner: 389704.47 | backward_allreduce_microstep: 7.94 | backward_allreduce: 2.70 | reduce_tied_grads: 0.31 | comms: 21.14 | reduce_grads: 0.20 | step: 360.92 | _step_clipping: 0.15 | _step_step: 359.03 | _step_zero_grad: 0.53 | _step_check_overflow: 0.62 samples/sec: 16.224 | iteration 2310/ 143000 | elapsed time per iteration (ms): 63118.1 | learning rate: 5.999E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 3.281193E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 15:52:26,452] [INFO] [logging.py:60:log_dist] [Rank 0] step=2320, skipped=0, lr=[0.000599942656379862, 0.000599942656379862], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2320 loss: 3.2719 iter time (s): 62.825 samples/sec: 16.299 %comms: 0.002854624279705888 %optimizer_step 0.0563378152437338 %forward: 23.117700678877213 %backward: 62.055938674854325 [2025-03-26 15:52:26,452] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17722.68 | forward: 145236.20 | backward_microstep: 389873.86 | backward: 389864.41 | backward_inner_microstep: 389847.79 | backward_inner: 389841.37 | backward_allreduce_microstep: 7.98 | backward_allreduce: 2.71 | reduce_tied_grads: 0.30 | comms: 17.93 | reduce_grads: 0.19 | step: 353.94 | _step_clipping: 0.13 | _step_step: 352.21 | _step_zero_grad: 0.47 | _step_check_overflow: 0.57 samples/sec: 16.299 | iteration 2320/ 143000 | elapsed time per iteration (ms): 62825.2 | learning rate: 5.999E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 3.278809E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 16:02:55,288] [INFO] [logging.py:60:log_dist] [Rank 0] step=2330, skipped=0, lr=[0.0005999413605619602, 0.0005999413605619602], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2330 loss: 3.2734 iter time (s): 62.883 samples/sec: 16.284 %comms: 0.0029062266223106995 %optimizer_step 0.057793007559903325 %forward: 23.099451276720913 %backward: 62.000787919091195 [2025-03-26 16:02:55,289] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18305.24 | forward: 145256.57 | backward_microstep: 389890.59 | backward: 389880.32 | backward_inner_microstep: 389863.93 | backward_inner: 389857.53 | backward_allreduce_microstep: 7.84 | backward_allreduce: 2.67 | reduce_tied_grads: 0.36 | comms: 18.28 | reduce_grads: 0.23 | step: 363.42 | _step_clipping: 0.14 | _step_step: 361.62 | _step_zero_grad: 0.50 | _step_check_overflow: 0.58 samples/sec: 16.284 | iteration 2330/ 143000 | elapsed time per iteration (ms): 62883.7 | learning rate: 5.999E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 3.276080E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 16:13:25,665] [INFO] [logging.py:60:log_dist] [Rank 0] step=2340, skipped=0, lr=[0.0005999400502675434, 0.0005999400502675434], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2340 loss: 3.2701 iter time (s): 63.037 samples/sec: 16.244 %comms: 0.002849846274226096 %optimizer_step 0.05767499744306583 %forward: 23.075481193549795 %backward: 61.87230768546599 [2025-03-26 16:13:25,666] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19567.94 | forward: 145461.12 | backward_microstep: 390034.32 | backward: 390025.02 | backward_inner_microstep: 390006.72 | backward_inner: 390000.38 | backward_allreduce_microstep: 7.99 | backward_allreduce: 2.75 | reduce_tied_grads: 0.29 | comms: 17.96 | reduce_grads: 0.19 | step: 363.57 | _step_clipping: 0.11 | _step_step: 361.88 | _step_zero_grad: 0.47 | _step_check_overflow: 0.55 samples/sec: 16.244 | iteration 2340/ 143000 | elapsed time per iteration (ms): 63037.6 | learning rate: 5.999E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 3.274400E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 16:23:56,354] [INFO] [logging.py:60:log_dist] [Rank 0] step=2350, skipped=0, lr=[0.0005999387254966747, 0.0005999387254966747], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2350 loss: 3.2672 iter time (s): 63.068 samples/sec: 16.236 %comms: 0.0028487371333036538 %optimizer_step 0.05775061592169531 %forward: 23.11057303606095 %backward: 61.84681930609535 [2025-03-26 16:23:56,355] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19589.91 | forward: 145754.52 | backward_microstep: 390067.89 | backward: 390057.55 | backward_inner_microstep: 390040.91 | backward_inner: 390034.61 | backward_allreduce_microstep: 7.97 | backward_allreduce: 2.78 | reduce_tied_grads: 0.33 | comms: 17.97 | reduce_grads: 0.19 | step: 364.22 | _step_clipping: 0.13 | _step_step: 362.34 | _step_zero_grad: 0.47 | _step_check_overflow: 0.70 samples/sec: 16.236 | iteration 2350/ 143000 | elapsed time per iteration (ms): 63068.9 | learning rate: 5.999E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 3.266541E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 16:34:25,079] [INFO] [logging.py:60:log_dist] [Rank 0] step=2360, skipped=0, lr=[0.0005999373862494182, 0.0005999373862494182], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2360 loss: 3.2733 iter time (s): 62.872 samples/sec: 16.287 %comms: 0.002857215928177675 %optimizer_step 0.05632301972124638 %forward: 23.1488093792076 %backward: 62.024697896437964 [2025-03-26 16:34:25,080] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18010.29 | forward: 145541.18 | backward_microstep: 389970.12 | backward: 389961.64 | backward_inner_microstep: 389945.59 | backward_inner: 389939.42 | backward_allreduce_microstep: 7.76 | backward_allreduce: 2.68 | reduce_tied_grads: 0.31 | comms: 17.96 | reduce_grads: 0.21 | step: 354.11 | _step_clipping: 0.12 | _step_step: 352.41 | _step_zero_grad: 0.45 | _step_check_overflow: 0.54 samples/sec: 16.287 | iteration 2360/ 143000 | elapsed time per iteration (ms): 62872.5 | learning rate: 5.999E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 3.265633E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 16:44:50,427] [INFO] [logging.py:60:log_dist] [Rank 0] step=2370, skipped=0, lr=[0.0005999360325258385, 0.0005999360325258385], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2370 loss: 3.2686 iter time (s): 62.534 samples/sec: 16.375 %comms: 0.002857665424335388 %optimizer_step 0.0557018098092716 %forward: 23.196358539777766 %backward: 62.32416865052005 [2025-03-26 16:44:50,428] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15450.59 | forward: 145056.62 | backward_microstep: 389745.99 | backward: 389739.33 | backward_inner_microstep: 389724.13 | backward_inner: 389718.43 | backward_allreduce_microstep: 7.40 | backward_allreduce: 2.55 | reduce_tied_grads: 0.25 | comms: 17.87 | reduce_grads: 0.18 | step: 348.33 | _step_clipping: 0.11 | _step_step: 346.74 | _step_zero_grad: 0.44 | _step_check_overflow: 0.50 samples/sec: 16.375 | iteration 2370/ 143000 | elapsed time per iteration (ms): 62534.8 | learning rate: 5.999E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 3.266066E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 16:55:15,815] [INFO] [logging.py:60:log_dist] [Rank 0] step=2380, skipped=0, lr=[0.0005999346643260009, 0.0005999346643260009], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2380 loss: 3.2636 iter time (s): 62.538 samples/sec: 16.374 %comms: 0.002871470247580887 %optimizer_step 0.056030802223222195 %forward: 23.196662529932286 %backward: 62.32154455747191 [2025-03-26 16:55:15,816] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15527.11 | forward: 145067.99 | backward_microstep: 389755.31 | backward: 389748.36 | backward_inner_microstep: 389732.89 | backward_inner: 389727.18 | backward_allreduce_microstep: 7.39 | backward_allreduce: 2.53 | reduce_tied_grads: 0.26 | comms: 17.96 | reduce_grads: 0.19 | step: 350.41 | _step_clipping: 0.12 | _step_step: 348.80 | _step_zero_grad: 0.45 | _step_check_overflow: 0.49 samples/sec: 16.374 | iteration 2380/ 143000 | elapsed time per iteration (ms): 62538.8 | learning rate: 5.999E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 3.260255E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 17:05:46,567] [INFO] [logging.py:60:log_dist] [Rank 0] step=2390, skipped=0, lr=[0.0005999332816499716, 0.0005999332816499716], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2390 loss: 3.2481 iter time (s): 63.075 samples/sec: 16.235 %comms: 0.00284886747104283 %optimizer_step 0.05778013372387654 %forward: 23.02028626257077 %backward: 61.80452469841432 [2025-03-26 17:05:46,568] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20705.02 | forward: 145199.65 | backward_microstep: 389838.42 | backward: 389829.87 | backward_inner_microstep: 389814.02 | backward_inner: 389807.81 | backward_allreduce_microstep: 7.59 | backward_allreduce: 2.60 | reduce_tied_grads: 0.30 | comms: 17.97 | reduce_grads: 0.19 | step: 364.45 | _step_clipping: 0.14 | _step_step: 362.60 | _step_zero_grad: 0.48 | _step_check_overflow: 0.66 samples/sec: 16.235 | iteration 2390/ 143000 | elapsed time per iteration (ms): 63075.2 | learning rate: 5.999E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 3.246436E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 17:16:21,672] [INFO] [logging.py:60:log_dist] [Rank 0] step=2400, skipped=0, lr=[0.0005999318844978168, 0.0005999318844978168], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2400 loss: 3.2558 iter time (s): 63.509 samples/sec: 16.124 %comms: 0.0028553525936486227 %optimizer_step 0.05519975442526129 %forward: 22.90704692412009 %backward: 61.41280157662254 [2025-03-26 17:16:21,673] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24632.51 | forward: 145480.83 | backward_microstep: 390038.26 | backward: 390027.82 | backward_inner_microstep: 390011.31 | backward_inner: 390004.97 | backward_allreduce_microstep: 7.89 | backward_allreduce: 2.71 | reduce_tied_grads: 0.33 | comms: 18.13 | reduce_grads: 0.20 | step: 350.57 | _step_clipping: 0.19 | _step_step: 348.74 | _step_zero_grad: 0.50 | _step_check_overflow: 0.55 samples/sec: 16.123 | iteration 2400/ 143000 | elapsed time per iteration (ms): 63510.5 | learning rate: 5.999E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 3.249962E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 17:26:49,027] [INFO] [logging.py:60:log_dist] [Rank 0] step=2410, skipped=0, lr=[0.0005999304728696047, 0.0005999304728696047], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2410 loss: 3.2456 iter time (s): 62.735 samples/sec: 16.323 %comms: 0.0028856146791468415 %optimizer_step 0.054921993231850416 %forward: 23.154525509759736 %backward: 62.16096648379016 [2025-03-26 17:26:49,027] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17257.12 | forward: 145259.76 | backward_microstep: 389975.51 | backward: 389966.41 | backward_inner_microstep: 389950.13 | backward_inner: 389943.93 | backward_allreduce_microstep: 7.76 | backward_allreduce: 2.67 | reduce_tied_grads: 0.28 | comms: 18.10 | reduce_grads: 0.19 | step: 344.55 | _step_clipping: 0.14 | _step_step: 342.82 | _step_zero_grad: 0.47 | _step_check_overflow: 0.57 samples/sec: 16.323 | iteration 2410/ 143000 | elapsed time per iteration (ms): 62735.5 | learning rate: 5.999E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 3.253920E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 17:37:23,539] [INFO] [logging.py:60:log_dist] [Rank 0] step=2420, skipped=0, lr=[0.0005999290467654029, 0.0005999290467654029], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2420 loss: 3.2348 iter time (s): 63.451 samples/sec: 16.139 %comms: 0.002845251264948583 %optimizer_step 0.05469198111703519 %forward: 22.886654828085213 %backward: 61.45558105275425 [2025-03-26 17:37:23,539] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24526.29 | forward: 145217.22 | backward_microstep: 389947.30 | backward: 389939.42 | backward_inner_microstep: 389923.29 | backward_inner: 389917.20 | backward_allreduce_microstep: 7.78 | backward_allreduce: 2.67 | reduce_tied_grads: 0.31 | comms: 18.05 | reduce_grads: 0.19 | step: 347.02 | _step_clipping: 0.13 | _step_step: 345.28 | _step_zero_grad: 0.49 | _step_check_overflow: 0.56 samples/sec: 16.138 | iteration 2420/ 143000 | elapsed time per iteration (ms): 63451.2 | learning rate: 5.999E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 3.253065E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 17:47:52,144] [INFO] [logging.py:60:log_dist] [Rank 0] step=2430, skipped=0, lr=[0.0005999276061852803, 0.0005999276061852803], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2430 loss: 3.2275 iter time (s): 62.860 samples/sec: 16.290 %comms: 0.0028666408544078363 %optimizer_step 0.05602450920734244 %forward: 23.093043673518434 %backward: 62.03195188846031 [2025-03-26 17:47:52,144] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18724.19 | forward: 145162.65 | backward_microstep: 389941.78 | backward: 389932.24 | backward_inner_microstep: 389916.04 | backward_inner: 389909.91 | backward_allreduce_microstep: 7.70 | backward_allreduce: 2.65 | reduce_tied_grads: 0.32 | comms: 18.02 | reduce_grads: 0.18 | step: 352.17 | _step_clipping: 0.15 | _step_step: 350.37 | _step_zero_grad: 0.48 | _step_check_overflow: 0.58 samples/sec: 16.290 | iteration 2430/ 143000 | elapsed time per iteration (ms): 62860.5 | learning rate: 5.999E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 3.241866E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 17:58:26,753] [INFO] [logging.py:60:log_dist] [Rank 0] step=2440, skipped=0, lr=[0.0005999261511293066, 0.0005999261511293066], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2440 loss: 3.2623 iter time (s): 63.460 samples/sec: 16.136 %comms: 0.002864534544756962 %optimizer_step 0.055631615724029054 %forward: 22.889858892083826 %backward: 61.435478296657685 [2025-03-26 17:58:26,754] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24745.29 | forward: 145260.06 | backward_microstep: 389883.19 | backward: 389872.26 | backward_inner_microstep: 389856.04 | backward_inner: 389848.29 | backward_allreduce_microstep: 7.77 | backward_allreduce: 2.67 | reduce_tied_grads: 0.32 | comms: 18.18 | reduce_grads: 0.20 | step: 353.04 | _step_clipping: 0.14 | _step_step: 351.04 | _step_zero_grad: 0.50 | _step_check_overflow: 0.76 samples/sec: 16.136 | iteration 2440/ 143000 | elapsed time per iteration (ms): 63461.0 | learning rate: 5.999E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 3.244534E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 18:08:55,466] [INFO] [logging.py:60:log_dist] [Rank 0] step=2450, skipped=0, lr=[0.000599924681597552, 0.000599924681597552], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2450 loss: 3.2402 iter time (s): 62.871 samples/sec: 16.287 %comms: 0.0028543948759541422 %optimizer_step 0.05639153438493715 %forward: 23.10463633379389 %backward: 62.015397186168705 [2025-03-26 18:08:55,467] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18891.07 | forward: 145260.35 | backward_microstep: 389904.11 | backward: 389894.84 | backward_inner_microstep: 389877.27 | backward_inner: 389871.17 | backward_allreduce_microstep: 9.22 | backward_allreduce: 2.60 | reduce_tied_grads: 0.30 | comms: 17.95 | reduce_grads: 0.18 | step: 354.54 | _step_clipping: 0.13 | _step_step: 352.82 | _step_zero_grad: 0.46 | _step_check_overflow: 0.58 samples/sec: 16.287 | iteration 2450/ 143000 | elapsed time per iteration (ms): 62871.2 | learning rate: 5.999E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 3.242076E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 18:19:23,508] [INFO] [logging.py:60:log_dist] [Rank 0] step=2460, skipped=0, lr=[0.0005999231975900873, 0.0005999231975900873], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2460 loss: 3.2289 iter time (s): 62.804 samples/sec: 16.305 %comms: 0.002885836077632738 %optimizer_step 0.05621587268358152 %forward: 23.112970502468176 %backward: 62.07685375525862 [2025-03-26 18:19:23,508] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18405.65 | forward: 145157.89 | backward_microstep: 389875.19 | backward: 389865.29 | backward_inner_microstep: 389849.71 | backward_inner: 389843.70 | backward_allreduce_microstep: 7.37 | backward_allreduce: 2.52 | reduce_tied_grads: 0.34 | comms: 18.12 | reduce_grads: 0.21 | step: 353.06 | _step_clipping: 0.16 | _step_step: 351.11 | _step_zero_grad: 0.51 | _step_check_overflow: 0.69 samples/sec: 16.305 | iteration 2460/ 143000 | elapsed time per iteration (ms): 62804.2 | learning rate: 5.999E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 3.238621E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 18:29:50,892] [INFO] [logging.py:60:log_dist] [Rank 0] step=2470, skipped=0, lr=[0.0005999216991069842, 0.0005999216991069842], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2470 loss: 3.2306 iter time (s): 62.738 samples/sec: 16.322 %comms: 0.002867581562829732 %optimizer_step 0.056017482645098676 %forward: 23.174595140543794 %backward: 62.16582057885627 [2025-03-26 18:29:50,893] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17402.48 | forward: 145392.42 | backward_microstep: 390025.21 | backward: 390014.97 | backward_inner_microstep: 389998.92 | backward_inner: 389992.66 | backward_allreduce_microstep: 7.63 | backward_allreduce: 2.64 | reduce_tied_grads: 0.31 | comms: 17.99 | reduce_grads: 0.20 | step: 351.44 | _step_clipping: 0.14 | _step_step: 349.60 | _step_zero_grad: 0.47 | _step_check_overflow: 0.65 samples/sec: 16.322 | iteration 2470/ 143000 | elapsed time per iteration (ms): 62738.4 | learning rate: 5.999E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 3.234676E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 18:40:24,284] [INFO] [logging.py:60:log_dist] [Rank 0] step=2480, skipped=0, lr=[0.000599920186148315, 0.000599920186148315], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2480 loss: 3.2132 iter time (s): 63.339 samples/sec: 16.167 %comms: 0.0028383536035140387 %optimizer_step 0.05560427459048988 %forward: 22.94694091785954 %backward: 61.56871712739087 [2025-03-26 18:40:24,284] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23582.28 | forward: 145342.55 | backward_microstep: 389975.71 | backward: 389967.20 | backward_inner_microstep: 389951.25 | backward_inner: 389945.19 | backward_allreduce_microstep: 7.63 | backward_allreduce: 2.62 | reduce_tied_grads: 0.32 | comms: 17.98 | reduce_grads: 0.20 | step: 352.19 | _step_clipping: 0.13 | _step_step: 350.44 | _step_zero_grad: 0.48 | _step_check_overflow: 0.56 samples/sec: 16.167 | iteration 2480/ 143000 | elapsed time per iteration (ms): 63339.1 | learning rate: 5.999E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 3.222786E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 18:50:50,042] [INFO] [logging.py:60:log_dist] [Rank 0] step=2490, skipped=0, lr=[0.0005999186587141527, 0.0005999186587141527], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2490 loss: 3.2375 iter time (s): 62.575 samples/sec: 16.364 %comms: 0.0028841747527083524 %optimizer_step 0.05636752221758228 %forward: 23.226489871232474 %backward: 62.35362684062632 [2025-03-26 18:50:50,043] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15761.33 | forward: 145340.45 | backward_microstep: 390188.13 | backward: 390179.68 | backward_inner_microstep: 390161.06 | backward_inner: 390154.70 | backward_allreduce_microstep: 8.59 | backward_allreduce: 3.55 | reduce_tied_grads: 0.28 | comms: 18.05 | reduce_grads: 0.18 | step: 352.72 | _step_clipping: 0.14 | _step_step: 350.99 | _step_zero_grad: 0.47 | _step_check_overflow: 0.55 samples/sec: 16.364 | iteration 2490/ 143000 | elapsed time per iteration (ms): 62575.8 | learning rate: 5.999E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 3.227915E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 19:01:20,540] [INFO] [logging.py:60:log_dist] [Rank 0] step=2500, skipped=0, lr=[0.0005999171168045711, 0.0005999171168045711], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2500 loss: 3.2408 iter time (s): 63.049 samples/sec: 16.241 %comms: 0.00285232278638949 %optimizer_step 0.05531219930612977 %forward: 23.028281958543115 %backward: 61.83504472530351 [2025-03-26 19:01:20,541] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21054.22 | forward: 145191.54 | backward_microstep: 389872.26 | backward: 389865.18 | backward_inner_microstep: 389849.58 | backward_inner: 389843.66 | backward_allreduce_microstep: 7.58 | backward_allreduce: 2.59 | reduce_tied_grads: 0.31 | comms: 17.98 | reduce_grads: 0.20 | step: 348.74 | _step_clipping: 0.13 | _step_step: 347.00 | _step_zero_grad: 0.51 | _step_check_overflow: 0.52 samples/sec: 16.241 | iteration 2500/ 143000 | elapsed time per iteration (ms): 63049.8 | learning rate: 5.999E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 3.225502E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 19:11:45,771] [INFO] [logging.py:60:log_dist] [Rank 0] step=2510, skipped=0, lr=[0.0005999155604196445, 0.0005999155604196445], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2510 loss: 3.2409 iter time (s): 62.523 samples/sec: 16.378 %comms: 0.0028678086834411348 %optimizer_step 0.055324702729484095 %forward: 23.224273493284358 %backward: 62.35785158381509 [2025-03-26 19:11:45,771] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15800.86 | forward: 145204.06 | backward_microstep: 389885.34 | backward: 389877.15 | backward_inner_microstep: 389857.94 | backward_inner: 389851.88 | backward_allreduce_microstep: 11.01 | backward_allreduce: 6.12 | reduce_tied_grads: 0.29 | comms: 17.93 | reduce_grads: 0.20 | step: 345.90 | _step_clipping: 0.12 | _step_step: 344.27 | _step_zero_grad: 0.46 | _step_check_overflow: 0.51 samples/sec: 16.378 | iteration 2510/ 143000 | elapsed time per iteration (ms): 62523.1 | learning rate: 5.999E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 3.228913E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 19:22:13,547] [INFO] [logging.py:60:log_dist] [Rank 0] step=2520, skipped=0, lr=[0.0005999139895594482, 0.0005999139895594482], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2520 loss: 3.2166 iter time (s): 62.777 samples/sec: 16.312 %comms: 0.002846458050207226 %optimizer_step 0.0551317098493565 %forward: 23.125978035574526 %backward: 62.123950287889464 [2025-03-26 19:22:13,547] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18307.51 | forward: 145178.15 | backward_microstep: 390003.69 | backward: 389996.05 | backward_inner_microstep: 389980.62 | backward_inner: 389974.81 | backward_allreduce_microstep: 7.26 | backward_allreduce: 2.48 | reduce_tied_grads: 0.27 | comms: 17.87 | reduce_grads: 0.18 | step: 346.10 | _step_clipping: 0.11 | _step_step: 344.49 | _step_zero_grad: 0.47 | _step_check_overflow: 0.50 samples/sec: 16.312 | iteration 2520/ 143000 | elapsed time per iteration (ms): 62777.6 | learning rate: 5.999E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 3.216475E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 19:32:37,033] [INFO] [logging.py:60:log_dist] [Rank 0] step=2530, skipped=0, lr=[0.0005999124042240578, 0.0005999124042240578], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2530 loss: 3.2229 iter time (s): 62.348 samples/sec: 16.424 %comms: 0.0028877671210829174 %optimizer_step 0.05726751524150481 %forward: 23.285881318791823 %backward: 62.563489247686476 [2025-03-26 19:32:37,033] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13974.16 | forward: 145182.86 | backward_microstep: 390078.38 | backward: 390070.96 | backward_inner_microstep: 390055.81 | backward_inner: 390049.96 | backward_allreduce_microstep: 7.25 | backward_allreduce: 2.49 | reduce_tied_grads: 0.33 | comms: 18.00 | reduce_grads: 0.20 | step: 357.05 | _step_clipping: 0.14 | _step_step: 355.31 | _step_zero_grad: 0.47 | _step_check_overflow: 0.58 samples/sec: 16.424 | iteration 2530/ 143000 | elapsed time per iteration (ms): 62348.6 | learning rate: 5.999E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 3.220263E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 19:43:00,418] [INFO] [logging.py:60:log_dist] [Rank 0] step=2540, skipped=0, lr=[0.00059991080441355, 0.00059991080441355], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2540 loss: 3.2284 iter time (s): 62.338 samples/sec: 16.427 %comms: 0.0028782884422026414 %optimizer_step 0.056923081001290385 %forward: 23.28950545823361 %backward: 62.54735395373615 [2025-03-26 19:43:00,419] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14126.22 | forward: 145182.06 | backward_microstep: 389915.83 | backward: 389907.53 | backward_inner_microstep: 389892.00 | backward_inner: 389886.04 | backward_allreduce_microstep: 7.40 | backward_allreduce: 2.54 | reduce_tied_grads: 0.29 | comms: 17.94 | reduce_grads: 0.18 | step: 354.85 | _step_clipping: 0.13 | _step_step: 352.92 | _step_zero_grad: 0.48 | _step_check_overflow: 0.54 samples/sec: 16.426 | iteration 2540/ 143000 | elapsed time per iteration (ms): 62338.6 | learning rate: 5.999E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 3.215069E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 19:53:23,681] [INFO] [logging.py:60:log_dist] [Rank 0] step=2550, skipped=0, lr=[0.0005999091901280019, 0.0005999091901280019], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2550 loss: 3.2127 iter time (s): 62.326 samples/sec: 16.430 %comms: 0.0029333684495966394 %optimizer_step 0.060304549854532055 %forward: 23.277966897070552 %backward: 62.55496538853986 [2025-03-26 19:53:23,681] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14166.18 | forward: 145081.47 | backward_microstep: 389885.26 | backward: 389877.97 | backward_inner_microstep: 389862.63 | backward_inner: 389856.65 | backward_allreduce_microstep: 7.34 | backward_allreduce: 2.52 | reduce_tied_grads: 0.31 | comms: 18.28 | reduce_grads: 0.19 | step: 375.85 | _step_clipping: 0.13 | _step_step: 374.06 | _step_zero_grad: 0.49 | _step_check_overflow: 0.58 samples/sec: 16.430 | iteration 2550/ 143000 | elapsed time per iteration (ms): 62326.2 | learning rate: 5.999E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 3.208006E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 20:03:46,381] [INFO] [logging.py:60:log_dist] [Rank 0] step=2560, skipped=0, lr=[0.0005999075613674914, 0.0005999075613674914], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2560 loss: 3.2025 iter time (s): 62.270 samples/sec: 16.445 %comms: 0.0028621934412506865 %optimizer_step 0.05648270011682335 %forward: 23.300638586048535 %backward: 62.6045852649063 [2025-03-26 20:03:46,382] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13737.36 | forward: 145091.97 | backward_microstep: 389842.30 | backward: 389835.78 | backward_inner_microstep: 389820.93 | backward_inner: 389815.27 | backward_allreduce_microstep: 7.19 | backward_allreduce: 2.49 | reduce_tied_grads: 0.24 | comms: 17.82 | reduce_grads: 0.18 | step: 351.72 | _step_clipping: 0.10 | _step_step: 349.95 | _step_zero_grad: 0.45 | _step_check_overflow: 0.52 samples/sec: 16.445 | iteration 2560/ 143000 | elapsed time per iteration (ms): 62270.0 | learning rate: 5.999E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 3.206175E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 20:14:08,696] [INFO] [logging.py:60:log_dist] [Rank 0] step=2570, skipped=0, lr=[0.0005999059181320972, 0.0005999059181320972], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2570 loss: 3.2055 iter time (s): 62.231 samples/sec: 16.455 %comms: 0.002868640264120195 %optimizer_step 0.056052731174004326 %forward: 23.301609905887908 %backward: 62.631782726035624 [2025-03-26 20:14:08,696] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13563.40 | forward: 145008.20 | backward_microstep: 389769.69 | backward: 389763.72 | backward_inner_microstep: 389749.14 | backward_inner: 389743.69 | backward_allreduce_microstep: 7.09 | backward_allreduce: 2.43 | reduce_tied_grads: 0.25 | comms: 17.85 | reduce_grads: 0.17 | step: 348.82 | _step_clipping: 0.16 | _step_step: 347.19 | _step_zero_grad: 0.45 | _step_check_overflow: 0.49 samples/sec: 16.455 | iteration 2570/ 143000 | elapsed time per iteration (ms): 62231.5 | learning rate: 5.999E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 3.208460E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 20:24:31,350] [INFO] [logging.py:60:log_dist] [Rank 0] step=2580, skipped=0, lr=[0.0005999042604218987, 0.0005999042604218987], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2580 loss: 3.2111 iter time (s): 62.265 samples/sec: 16.446 %comms: 0.0028642457077507877 %optimizer_step 0.05601748102586745 %forward: 23.298399381599236 %backward: 62.61023735130608 [2025-03-26 20:24:31,350] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13791.90 | forward: 145067.16 | backward_microstep: 389848.59 | backward: 389841.78 | backward_inner_microstep: 389827.00 | backward_inner: 389821.21 | backward_allreduce_microstep: 7.12 | backward_allreduce: 2.44 | reduce_tied_grads: 0.25 | comms: 17.83 | reduce_grads: 0.18 | step: 348.79 | _step_clipping: 0.10 | _step_step: 347.18 | _step_zero_grad: 0.45 | _step_check_overflow: 0.52 samples/sec: 16.446 | iteration 2580/ 143000 | elapsed time per iteration (ms): 62265.4 | learning rate: 5.999E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 3.204976E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 20:35:03,541] [INFO] [logging.py:60:log_dist] [Rank 0] step=2590, skipped=0, lr=[0.0005999025882369757, 0.0005999025882369757], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2590 loss: 3.1888 iter time (s): 63.219 samples/sec: 16.198 %comms: 0.002829066058126757 %optimizer_step 0.05454514032477841 %forward: 22.963761621117644 %backward: 61.685471118460974 [2025-03-26 20:35:03,542] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23150.22 | forward: 145173.77 | backward_microstep: 389973.91 | backward: 389967.13 | backward_inner_microstep: 389951.41 | backward_inner: 389945.65 | backward_allreduce_microstep: 7.61 | backward_allreduce: 2.56 | reduce_tied_grads: 0.45 | comms: 17.88 | reduce_grads: 0.19 | step: 344.83 | _step_clipping: 0.13 | _step_step: 343.06 | _step_zero_grad: 0.49 | _step_check_overflow: 0.61 samples/sec: 16.198 | iteration 2590/ 143000 | elapsed time per iteration (ms): 63219.2 | learning rate: 5.999E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 3.197123E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 20:45:31,376] [INFO] [logging.py:60:log_dist] [Rank 0] step=2600, skipped=0, lr=[0.000599900901577409, 0.000599900901577409], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2600 loss: 3.1890 iter time (s): 62.783 samples/sec: 16.310 %comms: 0.0032853752784051653 %optimizer_step 0.05666719817968229 %forward: 23.166409125875553 %backward: 62.183892619499304 [2025-03-26 20:45:31,377] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18028.40 | forward: 145445.48 | backward_microstep: 390417.17 | backward: 390408.63 | backward_inner_microstep: 390391.78 | backward_inner: 390385.45 | backward_allreduce_microstep: 8.09 | backward_allreduce: 2.77 | reduce_tied_grads: 0.29 | comms: 20.63 | reduce_grads: 0.20 | step: 355.77 | _step_clipping: 0.12 | _step_step: 354.04 | _step_zero_grad: 0.51 | _step_check_overflow: 0.52 samples/sec: 16.310 | iteration 2600/ 143000 | elapsed time per iteration (ms): 62783.5 | learning rate: 5.999E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 3.189744E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 20:55:59,312] [INFO] [logging.py:60:log_dist] [Rank 0] step=2610, skipped=0, lr=[0.0005998992004432799, 0.0005998992004432799], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2610 loss: 3.1882 iter time (s): 62.793 samples/sec: 16.308 %comms: 0.0028588687402516947 %optimizer_step 0.05627899253180067 %forward: 23.162574732976964 %backward: 62.18064873276315 [2025-03-26 20:55:59,313] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18121.81 | forward: 145445.02 | backward_microstep: 390460.17 | backward: 390451.65 | backward_inner_microstep: 390435.31 | backward_inner: 390429.16 | backward_allreduce_microstep: 7.90 | backward_allreduce: 2.72 | reduce_tied_grads: 0.30 | comms: 17.95 | reduce_grads: 0.20 | step: 353.39 | _step_clipping: 0.12 | _step_step: 351.73 | _step_zero_grad: 0.46 | _step_check_overflow: 0.51 samples/sec: 16.307 | iteration 2610/ 143000 | elapsed time per iteration (ms): 62793.6 | learning rate: 5.999E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 3.198672E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 21:06:24,520] [INFO] [logging.py:60:log_dist] [Rank 0] step=2620, skipped=0, lr=[0.0005998974848346706, 0.0005998974848346706], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2620 loss: 3.1911 iter time (s): 62.520 samples/sec: 16.379 %comms: 0.00315003535290515 %optimizer_step 0.05665094692134182 %forward: 23.234173974397464 %backward: 62.42110551067115 [2025-03-26 21:06:24,520] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15868.35 | forward: 145260.56 | backward_microstep: 390265.54 | backward: 390258.12 | backward_inner_microstep: 390242.48 | backward_inner: 390236.62 | backward_allreduce_microstep: 7.58 | backward_allreduce: 2.58 | reduce_tied_grads: 0.28 | comms: 19.69 | reduce_grads: 0.19 | step: 354.18 | _step_clipping: 0.13 | _step_step: 352.48 | _step_zero_grad: 0.47 | _step_check_overflow: 0.54 samples/sec: 16.379 | iteration 2620/ 143000 | elapsed time per iteration (ms): 62520.7 | learning rate: 5.999E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 3.185891E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 21:16:51,392] [INFO] [logging.py:60:log_dist] [Rank 0] step=2630, skipped=0, lr=[0.000599895754751664, 0.000599895754751664], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2630 loss: 3.1890 iter time (s): 62.687 samples/sec: 16.335 %comms: 0.0028745995717806222 %optimizer_step 0.05602500661028398 %forward: 23.197904368115367 %backward: 62.26767357118587 [2025-03-26 21:16:51,393] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17330.11 | forward: 145420.00 | backward_microstep: 390343.67 | backward: 390335.47 | backward_inner_microstep: 390319.12 | backward_inner: 390312.88 | backward_allreduce_microstep: 7.86 | backward_allreduce: 2.72 | reduce_tied_grads: 0.29 | comms: 18.02 | reduce_grads: 0.20 | step: 351.20 | _step_clipping: 0.12 | _step_step: 349.44 | _step_zero_grad: 0.52 | _step_check_overflow: 0.53 samples/sec: 16.335 | iteration 2630/ 143000 | elapsed time per iteration (ms): 62687.3 | learning rate: 5.999E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 3.181752E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 21:27:18,015] [INFO] [logging.py:60:log_dist] [Rank 0] step=2640, skipped=0, lr=[0.0005998940101943435, 0.0005998940101943435], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2640 loss: 3.1725 iter time (s): 62.662 samples/sec: 16.342 %comms: 0.0029036374580684705 %optimizer_step 0.05580570208397741 %forward: 23.205715098485786 %backward: 62.31009555542744 [2025-03-26 21:27:18,015] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16979.84 | forward: 145410.87 | backward_microstep: 390455.42 | backward: 390445.43 | backward_inner_microstep: 390428.98 | backward_inner: 390422.75 | backward_allreduce_microstep: 7.93 | backward_allreduce: 2.74 | reduce_tied_grads: 0.32 | comms: 18.19 | reduce_grads: 0.20 | step: 349.69 | _step_clipping: 0.12 | _step_step: 348.03 | _step_zero_grad: 0.47 | _step_check_overflow: 0.50 samples/sec: 16.342 | iteration 2640/ 143000 | elapsed time per iteration (ms): 62662.2 | learning rate: 5.999E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 3.187061E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 21:37:45,642] [INFO] [logging.py:60:log_dist] [Rank 0] step=2650, skipped=0, lr=[0.0005998922511627932, 0.0005998922511627932], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2650 loss: 3.1759 iter time (s): 62.762 samples/sec: 16.316 %comms: 0.0028726221477000775 %optimizer_step 0.05748180736712205 %forward: 23.209238806854774 %backward: 62.23236639489094 [2025-03-26 21:37:45,643] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17614.86 | forward: 145666.32 | backward_microstep: 390593.19 | backward: 390584.12 | backward_inner_microstep: 390567.85 | backward_inner: 390561.66 | backward_allreduce_microstep: 7.79 | backward_allreduce: 2.67 | reduce_tied_grads: 0.30 | comms: 18.03 | reduce_grads: 0.21 | step: 360.77 | _step_clipping: 0.12 | _step_step: 359.05 | _step_zero_grad: 0.50 | _step_check_overflow: 0.50 samples/sec: 16.315 | iteration 2650/ 143000 | elapsed time per iteration (ms): 62762.8 | learning rate: 5.999E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 3.179272E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 21:48:11,823] [INFO] [logging.py:60:log_dist] [Rank 0] step=2660, skipped=0, lr=[0.0005998904776570982, 0.0005998904776570982], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2660 loss: 3.1700 iter time (s): 62.618 samples/sec: 16.353 %comms: 0.00289018721767876 %optimizer_step 0.05609469323243052 %forward: 23.23464925445512 %backward: 62.33910931501308 [2025-03-26 21:48:11,823] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16668.50 | forward: 145489.64 | backward_microstep: 390361.03 | backward: 390352.12 | backward_inner_microstep: 390335.93 | backward_inner: 390329.72 | backward_allreduce_microstep: 7.76 | backward_allreduce: 2.66 | reduce_tied_grads: 0.32 | comms: 18.10 | reduce_grads: 0.22 | step: 351.25 | _step_clipping: 0.12 | _step_step: 349.53 | _step_zero_grad: 0.46 | _step_check_overflow: 0.55 samples/sec: 16.353 | iteration 2660/ 143000 | elapsed time per iteration (ms): 62618.1 | learning rate: 5.999E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 3.168844E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 21:58:39,251] [INFO] [logging.py:60:log_dist] [Rank 0] step=2670, skipped=0, lr=[0.0005998886896773439, 0.0005998886896773439], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2670 loss: 3.1789 iter time (s): 62.742 samples/sec: 16.321 %comms: 0.0029374149884672253 %optimizer_step 0.056438753862188565 %forward: 23.24748683658297 %backward: 62.257039484088104 [2025-03-26 21:58:39,251] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17284.00 | forward: 145859.90 | backward_microstep: 390625.84 | backward: 390614.50 | backward_inner_microstep: 390597.96 | backward_inner: 390591.71 | backward_allreduce_microstep: 7.90 | backward_allreduce: 2.74 | reduce_tied_grads: 0.49 | comms: 18.43 | reduce_grads: 0.21 | step: 354.11 | _step_clipping: 0.12 | _step_step: 352.32 | _step_zero_grad: 0.53 | _step_check_overflow: 0.52 samples/sec: 16.321 | iteration 2670/ 143000 | elapsed time per iteration (ms): 62742.8 | learning rate: 5.999E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 3.181741E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 22:09:09,898] [INFO] [logging.py:60:log_dist] [Rank 0] step=2680, skipped=0, lr=[0.0005998868872236168, 0.0005998868872236168], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2680 loss: 3.1642 iter time (s): 63.064 samples/sec: 16.237 %comms: 0.002870358562220822 %optimizer_step 0.05609289453274366 %forward: 23.165253765958617 %backward: 61.97462529914809 [2025-03-26 22:09:09,899] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20071.43 | forward: 146089.86 | backward_microstep: 390850.34 | backward: 390838.14 | backward_inner_microstep: 390818.16 | backward_inner: 390811.60 | backward_allreduce_microstep: 8.11 | backward_allreduce: 2.78 | reduce_tied_grads: 0.31 | comms: 18.10 | reduce_grads: 0.21 | step: 353.75 | _step_clipping: 0.12 | _step_step: 351.96 | _step_zero_grad: 0.50 | _step_check_overflow: 0.56 samples/sec: 16.237 | iteration 2680/ 143000 | elapsed time per iteration (ms): 63064.8 | learning rate: 5.999E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 3.178232E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 22:19:39,135] [INFO] [logging.py:60:log_dist] [Rank 0] step=2690, skipped=0, lr=[0.0005998850702960037, 0.0005998850702960037], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2690 loss: 3.1599 iter time (s): 62.923 samples/sec: 16.274 %comms: 0.002873347854632877 %optimizer_step 0.057001434803653814 %forward: 23.204756582316744 %backward: 62.091326343015375 [2025-03-26 22:19:39,136] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18987.03 | forward: 146011.53 | backward_microstep: 390709.24 | backward: 390697.89 | backward_inner_microstep: 390679.21 | backward_inner: 390672.83 | backward_allreduce_microstep: 8.21 | backward_allreduce: 2.71 | reduce_tied_grads: 0.31 | comms: 18.08 | reduce_grads: 0.21 | step: 358.67 | _step_clipping: 0.12 | _step_step: 356.86 | _step_zero_grad: 0.53 | _step_check_overflow: 0.55 samples/sec: 16.274 | iteration 2690/ 143000 | elapsed time per iteration (ms): 62923.7 | learning rate: 5.999E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 3.167921E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 22:30:09,381] [INFO] [logging.py:60:log_dist] [Rank 0] step=2700, skipped=0, lr=[0.0005998832388945925, 0.0005998832388945925], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2700 loss: 3.1776 iter time (s): 63.024 samples/sec: 16.248 %comms: 0.002867424459864012 %optimizer_step 0.059995138845833046 %forward: 23.178563498509575 %backward: 62.0201603560466 [2025-03-26 22:30:09,381] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19769.53 | forward: 146080.54 | backward_microstep: 390887.38 | backward: 390875.76 | backward_inner_microstep: 390858.65 | backward_inner: 390850.37 | backward_allreduce_microstep: 8.18 | backward_allreduce: 2.87 | reduce_tied_grads: 0.31 | comms: 18.07 | reduce_grads: 0.20 | step: 378.11 | _step_clipping: 0.12 | _step_step: 376.28 | _step_zero_grad: 0.51 | _step_check_overflow: 0.62 samples/sec: 16.248 | iteration 2700/ 143000 | elapsed time per iteration (ms): 63024.6 | learning rate: 5.999E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 3.165032E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 22:40:40,059] [INFO] [logging.py:60:log_dist] [Rank 0] step=2710, skipped=0, lr=[0.0005998813930194714, 0.0005998813930194714], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2710 loss: 3.1683 iter time (s): 63.067 samples/sec: 16.237 %comms: 0.0028549089088649973 %optimizer_step 0.058351262376006865 %forward: 23.172436634189303 %backward: 61.9957432789878 [2025-03-26 22:40:40,060] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20067.13 | forward: 146142.24 | backward_microstep: 391004.47 | backward: 390990.26 | backward_inner_microstep: 390973.22 | backward_inner: 390966.83 | backward_allreduce_microstep: 8.03 | backward_allreduce: 2.76 | reduce_tied_grads: 0.29 | comms: 18.01 | reduce_grads: 0.20 | step: 368.01 | _step_clipping: 0.12 | _step_step: 366.03 | _step_zero_grad: 0.55 | _step_check_overflow: 0.72 samples/sec: 16.236 | iteration 2710/ 143000 | elapsed time per iteration (ms): 63067.8 | learning rate: 5.999E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 3.171773E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 22:51:08,656] [INFO] [logging.py:60:log_dist] [Rank 0] step=2720, skipped=0, lr=[0.0005998795326707295, 0.0005998795326707295], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2720 loss: 3.1716 iter time (s): 62.859 samples/sec: 16.290 %comms: 0.0028707721368818836 %optimizer_step 0.05683774573921193 %forward: 23.21351527145827 %backward: 62.1630607100194 [2025-03-26 22:51:08,657] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18553.47 | forward: 145918.15 | backward_microstep: 390762.42 | backward: 390751.62 | backward_inner_microstep: 390734.89 | backward_inner: 390728.43 | backward_allreduce_microstep: 7.96 | backward_allreduce: 2.73 | reduce_tied_grads: 0.31 | comms: 18.05 | reduce_grads: 0.21 | step: 357.28 | _step_clipping: 0.13 | _step_step: 355.37 | _step_zero_grad: 0.50 | _step_check_overflow: 0.67 samples/sec: 16.290 | iteration 2720/ 143000 | elapsed time per iteration (ms): 62859.7 | learning rate: 5.999E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 3.164438E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 23:01:39,923] [INFO] [logging.py:60:log_dist] [Rank 0] step=2730, skipped=0, lr=[0.0005998776578484568, 0.0005998776578484568], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2730 loss: 3.1656 iter time (s): 63.126 samples/sec: 16.221 %comms: 0.0038651266479069455 %optimizer_step 0.058813587823894384 %forward: 23.145100050342542 %backward: 61.92056106273145 [2025-03-26 23:01:39,924] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20888.17 | forward: 146106.02 | backward_microstep: 390894.65 | backward: 390880.43 | backward_inner_microstep: 390863.08 | backward_inner: 390856.54 | backward_allreduce_microstep: 8.32 | backward_allreduce: 3.00 | reduce_tied_grads: 0.36 | comms: 24.40 | reduce_grads: 0.23 | step: 371.27 | _step_clipping: 0.12 | _step_step: 369.45 | _step_zero_grad: 0.52 | _step_check_overflow: 0.55 samples/sec: 16.221 | iteration 2730/ 143000 | elapsed time per iteration (ms): 63126.7 | learning rate: 5.999E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 3.164646E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 23:12:11,782] [INFO] [logging.py:60:log_dist] [Rank 0] step=2740, skipped=0, lr=[0.0005998757685527435, 0.0005998757685527435], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2740 loss: 3.1604 iter time (s): 63.185 samples/sec: 16.206 %comms: 0.0028783278301549873 %optimizer_step 0.05856146844087975 %forward: 23.164606502544313 %backward: 61.85917170359918 [2025-03-26 23:12:11,782] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21278.03 | forward: 146366.32 | backward_microstep: 390873.01 | backward: 390859.18 | backward_inner_microstep: 390841.76 | backward_inner: 390835.14 | backward_allreduce_microstep: 8.21 | backward_allreduce: 2.84 | reduce_tied_grads: 0.34 | comms: 18.19 | reduce_grads: 0.23 | step: 370.02 | _step_clipping: 0.11 | _step_step: 368.27 | _step_zero_grad: 0.48 | _step_check_overflow: 0.54 samples/sec: 16.206 | iteration 2740/ 143000 | elapsed time per iteration (ms): 63185.9 | learning rate: 5.999E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 3.150316E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 23:22:40,051] [INFO] [logging.py:60:log_dist] [Rank 0] step=2750, skipped=0, lr=[0.000599873864783681, 0.000599873864783681], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2750 loss: 3.1715 iter time (s): 62.825 samples/sec: 16.299 %comms: 0.0028542453625499333 %optimizer_step 0.05541369444883978 %forward: 23.24259355699201 %backward: 62.19328803110756 [2025-03-26 23:22:40,051] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18220.98 | forward: 146022.75 | backward_microstep: 390743.58 | backward: 390732.42 | backward_inner_microstep: 390715.66 | backward_inner: 390709.09 | backward_allreduce_microstep: 7.96 | backward_allreduce: 2.74 | reduce_tied_grads: 0.28 | comms: 17.93 | reduce_grads: 0.19 | step: 348.14 | _step_clipping: 0.12 | _step_step: 346.46 | _step_zero_grad: 0.48 | _step_check_overflow: 0.53 samples/sec: 16.299 | iteration 2750/ 143000 | elapsed time per iteration (ms): 62826.9 | learning rate: 5.999E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 3.155274E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 23:33:08,426] [INFO] [logging.py:60:log_dist] [Rank 0] step=2760, skipped=0, lr=[0.0005998719465413611, 0.0005998719465413611], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2760 loss: 3.1721 iter time (s): 62.837 samples/sec: 16.296 %comms: 0.0028599859999593213 %optimizer_step 0.05602244250596252 %forward: 23.218715504338068 %backward: 62.173543404106645 [2025-03-26 23:33:08,426] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18582.39 | forward: 145899.31 | backward_microstep: 390695.38 | backward: 390679.53 | backward_inner_microstep: 390659.31 | backward_inner: 390652.91 | backward_allreduce_microstep: 7.90 | backward_allreduce: 2.72 | reduce_tied_grads: 0.30 | comms: 17.97 | reduce_grads: 0.20 | step: 352.03 | _step_clipping: 0.11 | _step_step: 350.25 | _step_zero_grad: 0.50 | _step_check_overflow: 0.60 samples/sec: 16.296 | iteration 2760/ 143000 | elapsed time per iteration (ms): 62837.5 | learning rate: 5.999E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 3.153684E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 23:43:36,823] [INFO] [logging.py:60:log_dist] [Rank 0] step=2770, skipped=0, lr=[0.0005998700138258764, 0.0005998700138258764], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2770 loss: 3.1409 iter time (s): 62.839 samples/sec: 16.296 %comms: 0.0028986245690261776 %optimizer_step 0.0572467539167636 %forward: 23.25085664101188 %backward: 62.16839864214657 [2025-03-26 23:43:36,823] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18445.23 | forward: 146106.34 | backward_microstep: 390672.05 | backward: 390660.77 | backward_inner_microstep: 390644.06 | backward_inner: 390637.67 | backward_allreduce_microstep: 7.92 | backward_allreduce: 2.72 | reduce_tied_grads: 0.31 | comms: 18.21 | reduce_grads: 0.20 | step: 359.73 | _step_clipping: 0.11 | _step_step: 358.05 | _step_zero_grad: 0.49 | _step_check_overflow: 0.51 samples/sec: 16.295 | iteration 2770/ 143000 | elapsed time per iteration (ms): 62839.7 | learning rate: 5.999E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 3.146126E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-26 23:54:07,099] [INFO] [logging.py:60:log_dist] [Rank 0] step=2780, skipped=0, lr=[0.00059986806663732, 0.00059986806663732], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2780 loss: 3.1383 iter time (s): 63.027 samples/sec: 16.247 %comms: 0.0028411432254503975 %optimizer_step 0.05763003713132891 %forward: 23.18108779974579 %backward: 61.98420164354456 [2025-03-26 23:54:07,100] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20363.35 | forward: 146103.69 | backward_microstep: 390681.27 | backward: 390668.50 | backward_inner_microstep: 390650.87 | backward_inner: 390644.21 | backward_allreduce_microstep: 8.46 | backward_allreduce: 2.88 | reduce_tied_grads: 0.29 | comms: 17.91 | reduce_grads: 0.20 | step: 363.23 | _step_clipping: 0.12 | _step_step: 361.56 | _step_zero_grad: 0.49 | _step_check_overflow: 0.48 samples/sec: 16.247 | iteration 2780/ 143000 | elapsed time per iteration (ms): 63027.6 | learning rate: 5.999E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 3.149677E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 00:04:42,713] [INFO] [logging.py:60:log_dist] [Rank 0] step=2790, skipped=0, lr=[0.0005998661049757863, 0.0005998661049757863], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2790 loss: 3.1551 iter time (s): 63.561 samples/sec: 16.111 %comms: 0.002856070581716201 %optimizer_step 0.059738654233825815 %forward: 23.093622006422155 %backward: 61.509448902964394 [2025-03-27 00:04:42,714] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24733.28 | forward: 146785.04 | backward_microstep: 390977.49 | backward: 390959.32 | backward_inner_microstep: 390940.41 | backward_inner: 390933.35 | backward_allreduce_microstep: 8.82 | backward_allreduce: 3.05 | reduce_tied_grads: 0.38 | comms: 18.15 | reduce_grads: 0.25 | step: 379.70 | _step_clipping: 0.14 | _step_step: 377.72 | _step_zero_grad: 0.54 | _step_check_overflow: 0.66 samples/sec: 16.110 | iteration 2790/ 143000 | elapsed time per iteration (ms): 63561.5 | learning rate: 5.999E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 3.145909E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 00:15:24,289] [INFO] [logging.py:60:log_dist] [Rank 0] step=2800, skipped=0, lr=[0.0005998641288413695, 0.0005998641288413695], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2800 loss: 3.1316 iter time (s): 64.157 samples/sec: 15.961 %comms: 0.0036854111640065358 %optimizer_step 0.05837366192259886 %forward: 22.952323885393795 %backward: 60.957777416154535 [2025-03-27 00:15:24,289] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30165.18 | forward: 147254.94 | backward_microstep: 391104.30 | backward: 391086.07 | backward_inner_microstep: 391066.90 | backward_inner: 391059.79 | backward_allreduce_microstep: 8.98 | backward_allreduce: 3.09 | reduce_tied_grads: 0.34 | comms: 23.64 | reduce_grads: 0.23 | step: 374.51 | _step_clipping: 0.13 | _step_step: 372.52 | _step_zero_grad: 0.59 | _step_check_overflow: 0.62 samples/sec: 15.961 | iteration 2800/ 143000 | elapsed time per iteration (ms): 64157.5 | learning rate: 5.999E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 3.145453E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 00:26:03,239] [INFO] [logging.py:60:log_dist] [Rank 0] step=2810, skipped=0, lr=[0.0005998621382341654, 0.0005998621382341654], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2810 loss: 3.1495 iter time (s): 63.894 samples/sec: 16.027 %comms: 0.0028240265711925354 %optimizer_step 0.0601504114366705 %forward: 23.017566249408237 %backward: 61.18921554877544 [2025-03-27 00:26:03,239] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27863.05 | forward: 147067.79 | backward_microstep: 390978.25 | backward: 390960.66 | backward_inner_microstep: 390942.06 | backward_inner: 390935.00 | backward_allreduce_microstep: 8.71 | backward_allreduce: 3.02 | reduce_tied_grads: 0.30 | comms: 18.04 | reduce_grads: 0.21 | step: 384.32 | _step_clipping: 0.12 | _step_step: 382.43 | _step_zero_grad: 0.58 | _step_check_overflow: 0.54 samples/sec: 16.026 | iteration 2810/ 143000 | elapsed time per iteration (ms): 63895.0 | learning rate: 5.999E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 3.139274E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 00:36:35,932] [INFO] [logging.py:60:log_dist] [Rank 0] step=2820, skipped=0, lr=[0.0005998601331542699, 0.0005998601331542699], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2820 loss: 3.1330 iter time (s): 63.269 samples/sec: 16.185 %comms: 0.0028550117381499693 %optimizer_step 0.05966704342348122 %forward: 23.11619463924632 %backward: 61.768553415384886 [2025-03-27 00:36:35,933] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22697.65 | forward: 146253.31 | backward_microstep: 390816.37 | backward: 390802.01 | backward_inner_microstep: 390783.86 | backward_inner: 390776.95 | backward_allreduce_microstep: 8.60 | backward_allreduce: 2.97 | reduce_tied_grads: 0.32 | comms: 18.06 | reduce_grads: 0.23 | step: 377.51 | _step_clipping: 0.14 | _step_step: 375.65 | _step_zero_grad: 0.55 | _step_check_overflow: 0.54 samples/sec: 16.185 | iteration 2820/ 143000 | elapsed time per iteration (ms): 63269.3 | learning rate: 5.999E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 3.145657E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 00:47:08,667] [INFO] [logging.py:60:log_dist] [Rank 0] step=2830, skipped=0, lr=[0.0005998581136017797, 0.0005998581136017797], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2830 loss: 3.1333 iter time (s): 63.273 samples/sec: 16.184 %comms: 0.002844234459456474 %optimizer_step 0.05700816970532724 %forward: 23.118361107444983 %backward: 61.75542988651227 [2025-03-27 00:47:08,669] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22806.24 | forward: 146276.70 | backward_microstep: 390758.73 | backward: 390744.84 | backward_inner_microstep: 390725.00 | backward_inner: 390718.10 | backward_allreduce_microstep: 8.64 | backward_allreduce: 2.96 | reduce_tied_grads: 0.31 | comms: 18.00 | reduce_grads: 0.23 | step: 360.71 | _step_clipping: 0.11 | _step_step: 358.89 | _step_zero_grad: 0.53 | _step_check_overflow: 0.56 samples/sec: 16.184 | iteration 2830/ 143000 | elapsed time per iteration (ms): 63273.6 | learning rate: 5.999E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 3.138600E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 00:57:42,748] [INFO] [logging.py:60:log_dist] [Rank 0] step=2840, skipped=0, lr=[0.0005998560795767923, 0.0005998560795767923], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2840 loss: 3.1548 iter time (s): 63.407 samples/sec: 16.150 %comms: 0.0028673075211779722 %optimizer_step 0.056332350258510126 %forward: 23.07044704679023 %backward: 61.65201338137105 [2025-03-27 00:57:42,748] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24005.15 | forward: 146283.67 | backward_microstep: 390934.12 | backward: 390919.29 | backward_inner_microstep: 390901.19 | backward_inner: 390894.33 | backward_allreduce_microstep: 8.54 | backward_allreduce: 2.95 | reduce_tied_grads: 0.31 | comms: 18.18 | reduce_grads: 0.22 | step: 357.19 | _step_clipping: 0.12 | _step_step: 355.35 | _step_zero_grad: 0.54 | _step_check_overflow: 0.54 samples/sec: 16.149 | iteration 2840/ 143000 | elapsed time per iteration (ms): 63408.0 | learning rate: 5.999E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 3.137931E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 01:08:19,883] [INFO] [logging.py:60:log_dist] [Rank 0] step=2850, skipped=0, lr=[0.0005998540310794059, 0.0005998540310794059], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2850 loss: 3.1206 iter time (s): 63.713 samples/sec: 16.072 %comms: 0.0028599908316465474 %optimizer_step 0.05807591054289207 %forward: 23.01198269212791 %backward: 61.38411336889599 [2025-03-27 01:08:19,884] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26556.28 | forward: 146616.21 | backward_microstep: 391114.82 | backward: 391096.51 | backward_inner_microstep: 391077.77 | backward_inner: 391070.83 | backward_allreduce_microstep: 8.75 | backward_allreduce: 3.02 | reduce_tied_grads: 0.31 | comms: 18.22 | reduce_grads: 0.22 | step: 370.02 | _step_clipping: 0.11 | _step_step: 368.22 | _step_zero_grad: 0.54 | _step_check_overflow: 0.53 samples/sec: 16.072 | iteration 2850/ 143000 | elapsed time per iteration (ms): 63713.5 | learning rate: 5.999E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 3.125838E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 01:18:56,445] [INFO] [logging.py:60:log_dist] [Rank 0] step=2860, skipped=0, lr=[0.0005998519681097194, 0.0005998519681097194], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2860 loss: 3.1431 iter time (s): 63.656 samples/sec: 16.087 %comms: 0.0028320049314909555 %optimizer_step 0.06114137293372038 %forward: 23.018748262547163 %backward: 61.41963906690427 [2025-03-27 01:18:56,446] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26211.49 | forward: 146527.29 | backward_microstep: 390988.95 | backward: 390970.58 | backward_inner_microstep: 390952.32 | backward_inner: 390945.51 | backward_allreduce_microstep: 8.59 | backward_allreduce: 2.98 | reduce_tied_grads: 0.31 | comms: 18.03 | reduce_grads: 0.21 | step: 389.20 | _step_clipping: 0.23 | _step_step: 387.27 | _step_zero_grad: 0.53 | _step_check_overflow: 0.53 samples/sec: 16.086 | iteration 2860/ 143000 | elapsed time per iteration (ms): 63656.2 | learning rate: 5.999E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 3.132965E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 01:29:24,411] [INFO] [logging.py:60:log_dist] [Rank 0] step=2870, skipped=0, lr=[0.0005998498906678323, 0.0005998498906678323], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2870 loss: 3.1456 iter time (s): 62.796 samples/sec: 16.307 %comms: 0.0028906677077160745 %optimizer_step 0.06249204730371011 %forward: 23.228575547248965 %backward: 62.20661410406952 [2025-03-27 01:29:24,412] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18699.85 | forward: 145866.16 | backward_microstep: 390644.06 | backward: 390632.65 | backward_inner_microstep: 390615.98 | backward_inner: 390609.63 | backward_allreduce_microstep: 7.89 | backward_allreduce: 2.70 | reduce_tied_grads: 0.34 | comms: 18.15 | reduce_grads: 0.21 | step: 392.43 | _step_clipping: 0.11 | _step_step: 390.59 | _step_zero_grad: 0.56 | _step_check_overflow: 0.54 samples/sec: 16.307 | iteration 2870/ 143000 | elapsed time per iteration (ms): 62796.6 | learning rate: 5.998E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 3.133488E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 01:39:55,825] [INFO] [logging.py:60:log_dist] [Rank 0] step=2880, skipped=0, lr=[0.000599847798753845, 0.000599847798753845], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2880 loss: 3.1307 iter time (s): 63.141 samples/sec: 16.218 %comms: 0.0028387065893981742 %optimizer_step 0.05554773109471463 %forward: 23.07593901865568 %backward: 61.83833487810582 [2025-03-27 01:39:55,825] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22598.62 | forward: 145703.42 | backward_microstep: 390461.51 | backward: 390452.44 | backward_inner_microstep: 390436.19 | backward_inner: 390430.00 | backward_allreduce_microstep: 7.76 | backward_allreduce: 2.70 | reduce_tied_grads: 0.30 | comms: 17.92 | reduce_grads: 0.19 | step: 350.73 | _step_clipping: 0.12 | _step_step: 349.02 | _step_zero_grad: 0.47 | _step_check_overflow: 0.56 samples/sec: 16.218 | iteration 2880/ 143000 | elapsed time per iteration (ms): 63141.4 | learning rate: 5.998E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 3.119243E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 01:50:27,086] [INFO] [logging.py:60:log_dist] [Rank 0] step=2890, skipped=0, lr=[0.0005998456923678584, 0.0005998456923678584], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2890 loss: 3.1292 iter time (s): 63.125 samples/sec: 16.222 %comms: 0.0028794766097976123 %optimizer_step 0.05771552983456614 %forward: 23.092993938016416 %backward: 61.866369716016145 [2025-03-27 01:50:27,086] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22325.82 | forward: 145775.31 | backward_microstep: 390543.67 | backward: 390533.56 | backward_inner_microstep: 390516.55 | backward_inner: 390510.09 | backward_allreduce_microstep: 8.14 | backward_allreduce: 2.80 | reduce_tied_grads: 0.33 | comms: 18.18 | reduce_grads: 0.21 | step: 364.33 | _step_clipping: 0.13 | _step_step: 362.59 | _step_zero_grad: 0.49 | _step_check_overflow: 0.53 samples/sec: 16.222 | iteration 2890/ 143000 | elapsed time per iteration (ms): 63126.1 | learning rate: 5.998E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 3.122279E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 02:00:58,097] [INFO] [logging.py:60:log_dist] [Rank 0] step=2900, skipped=0, lr=[0.000599843571509974, 0.000599843571509974], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2900 loss: 3.1241 iter time (s): 63.101 samples/sec: 16.228 %comms: 0.0028382889093574275 %optimizer_step 0.05629249703486203 %forward: 23.10554153411128 %backward: 61.878763129485016 [2025-03-27 02:00:58,098] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22214.10 | forward: 145797.29 | backward_microstep: 390467.82 | backward: 390458.53 | backward_inner_microstep: 390441.54 | backward_inner: 390435.14 | backward_allreduce_microstep: 8.27 | backward_allreduce: 2.98 | reduce_tied_grads: 0.29 | comms: 17.91 | reduce_grads: 0.19 | step: 355.21 | _step_clipping: 0.13 | _step_step: 353.35 | _step_zero_grad: 0.49 | _step_check_overflow: 0.67 samples/sec: 16.228 | iteration 2900/ 143000 | elapsed time per iteration (ms): 63101.2 | learning rate: 5.998E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 3.128455E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 02:11:26,841] [INFO] [logging.py:60:log_dist] [Rank 0] step=2910, skipped=0, lr=[0.0005998414361802944, 0.0005998414361802944], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2910 loss: 3.1288 iter time (s): 62.874 samples/sec: 16.287 %comms: 0.0028661574153930423 %optimizer_step 0.05638644450182995 %forward: 23.181850290541767 %backward: 62.0907740645256 [2025-03-27 02:11:26,842] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20124.06 | forward: 145753.18 | backward_microstep: 390396.97 | backward: 390388.49 | backward_inner_microstep: 390371.81 | backward_inner: 390365.56 | backward_allreduce_microstep: 7.93 | backward_allreduce: 2.75 | reduce_tied_grads: 0.31 | comms: 18.02 | reduce_grads: 0.19 | step: 354.52 | _step_clipping: 0.11 | _step_step: 352.84 | _step_zero_grad: 0.48 | _step_check_overflow: 0.53 samples/sec: 16.286 | iteration 2910/ 143000 | elapsed time per iteration (ms): 62874.4 | learning rate: 5.998E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 3.119190E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 02:21:54,592] [INFO] [logging.py:60:log_dist] [Rank 0] step=2920, skipped=0, lr=[0.0005998392863789224, 0.0005998392863789224], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2920 loss: 3.1076 iter time (s): 62.774 samples/sec: 16.312 %comms: 0.002863857568669772 %optimizer_step 0.05658059485372269 %forward: 23.20388596348866 %backward: 62.196965317452005 [2025-03-27 02:21:54,592] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19163.70 | forward: 145661.17 | backward_microstep: 390448.59 | backward: 390438.16 | backward_inner_microstep: 390421.49 | backward_inner: 390415.18 | backward_allreduce_microstep: 8.00 | backward_allreduce: 2.82 | reduce_tied_grads: 0.28 | comms: 17.98 | reduce_grads: 0.20 | step: 355.18 | _step_clipping: 0.12 | _step_step: 353.45 | _step_zero_grad: 0.51 | _step_check_overflow: 0.52 samples/sec: 16.312 | iteration 2920/ 143000 | elapsed time per iteration (ms): 62775.0 | learning rate: 5.998E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 3.116731E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 02:32:20,815] [INFO] [logging.py:60:log_dist] [Rank 0] step=2930, skipped=0, lr=[0.000599837122105962, 0.000599837122105962], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2930 loss: 3.1139 iter time (s): 62.622 samples/sec: 16.352 %comms: 0.002862234089356564 %optimizer_step 0.05679176821232235 %forward: 23.22723758855367 %backward: 62.3212339869138 [2025-03-27 02:32:20,816] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18102.32 | forward: 145453.20 | backward_microstep: 390274.39 | backward: 390266.93 | backward_inner_microstep: 390250.93 | backward_inner: 390245.00 | backward_allreduce_microstep: 7.78 | backward_allreduce: 2.70 | reduce_tied_grads: 0.30 | comms: 17.92 | reduce_grads: 0.19 | step: 355.64 | _step_clipping: 0.11 | _step_step: 353.94 | _step_zero_grad: 0.46 | _step_check_overflow: 0.58 samples/sec: 16.352 | iteration 2930/ 143000 | elapsed time per iteration (ms): 62622.4 | learning rate: 5.998E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 3.118945E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 02:42:47,553] [INFO] [logging.py:60:log_dist] [Rank 0] step=2940, skipped=0, lr=[0.0005998349433615177, 0.0005998349433615177], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2940 loss: 3.1376 iter time (s): 62.673 samples/sec: 16.339 %comms: 0.002861638150868939 %optimizer_step 0.05725992468971368 %forward: 23.22192129283005 %backward: 62.27192291908763 [2025-03-27 02:42:47,553] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18561.46 | forward: 145539.19 | backward_microstep: 390284.99 | backward: 390278.01 | backward_inner_microstep: 390261.84 | backward_inner: 390255.83 | backward_allreduce_microstep: 7.85 | backward_allreduce: 2.70 | reduce_tied_grads: 0.29 | comms: 17.93 | reduce_grads: 0.19 | step: 358.87 | _step_clipping: 0.12 | _step_step: 357.13 | _step_zero_grad: 0.52 | _step_check_overflow: 0.53 samples/sec: 16.339 | iteration 2940/ 143000 | elapsed time per iteration (ms): 62673.7 | learning rate: 5.998E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 3.115987E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 02:53:13,357] [INFO] [logging.py:60:log_dist] [Rank 0] step=2950, skipped=0, lr=[0.0005998327501456945, 0.0005998327501456945], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2950 loss: 3.1154 iter time (s): 62.580 samples/sec: 16.363 %comms: 0.002895390319622457 %optimizer_step 0.05809841190534514 %forward: 23.238772944021463 %backward: 62.356874937557926 [2025-03-27 02:53:13,358] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17862.32 | forward: 145428.10 | backward_microstep: 390236.24 | backward: 390228.95 | backward_inner_microstep: 390212.90 | backward_inner: 390206.98 | backward_allreduce_microstep: 7.71 | backward_allreduce: 2.69 | reduce_tied_grads: 0.28 | comms: 18.12 | reduce_grads: 0.20 | step: 363.58 | _step_clipping: 0.11 | _step_step: 361.81 | _step_zero_grad: 0.47 | _step_check_overflow: 0.62 samples/sec: 16.363 | iteration 2950/ 143000 | elapsed time per iteration (ms): 62580.5 | learning rate: 5.998E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 3.116016E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 03:03:40,339] [INFO] [logging.py:60:log_dist] [Rank 0] step=2960, skipped=0, lr=[0.0005998305424585982, 0.0005998305424585982], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2960 loss: 3.1168 iter time (s): 62.698 samples/sec: 16.332 %comms: 0.0028668772640153676 %optimizer_step 0.0577566384655246 %forward: 23.249643089923 %backward: 62.2723475341521 [2025-03-27 03:03:40,339] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18499.89 | forward: 145769.54 | backward_microstep: 390441.84 | backward: 390432.30 | backward_inner_microstep: 390416.02 | backward_inner: 390409.77 | backward_allreduce_microstep: 7.75 | backward_allreduce: 2.66 | reduce_tied_grads: 0.30 | comms: 17.97 | reduce_grads: 0.19 | step: 362.12 | _step_clipping: 0.11 | _step_step: 360.36 | _step_zero_grad: 0.48 | _step_check_overflow: 0.59 samples/sec: 16.332 | iteration 2960/ 143000 | elapsed time per iteration (ms): 62698.1 | learning rate: 5.998E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 3.110851E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 03:14:07,221] [INFO] [logging.py:60:log_dist] [Rank 0] step=2970, skipped=0, lr=[0.0005998283203003355, 0.0005998283203003355], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2970 loss: 3.1054 iter time (s): 62.688 samples/sec: 16.335 %comms: 0.003202511322634069 %optimizer_step 0.05781646536845335 %forward: 23.214763259851914 %backward: 62.276857106123686 [2025-03-27 03:14:07,222] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18719.79 | forward: 145527.95 | backward_microstep: 390407.86 | backward: 390399.12 | backward_inner_microstep: 390382.74 | backward_inner: 390376.55 | backward_allreduce_microstep: 7.87 | backward_allreduce: 2.71 | reduce_tied_grads: 0.29 | comms: 20.08 | reduce_grads: 0.20 | step: 362.44 | _step_clipping: 0.11 | _step_step: 360.73 | _step_zero_grad: 0.47 | _step_check_overflow: 0.56 samples/sec: 16.335 | iteration 2970/ 143000 | elapsed time per iteration (ms): 62688.2 | learning rate: 5.998E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 3.111454E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 03:24:35,050] [INFO] [logging.py:60:log_dist] [Rank 0] step=2980, skipped=0, lr=[0.0005998260836710135, 0.0005998260836710135], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2980 loss: 3.0988 iter time (s): 62.782 samples/sec: 16.310 %comms: 0.0028567383929577173 %optimizer_step 0.05578383052909456 %forward: 23.163930323876414 %backward: 62.17867420720841 [2025-03-27 03:24:35,050] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19822.94 | forward: 145428.61 | backward_microstep: 390382.40 | backward: 390372.35 | backward_inner_microstep: 390356.41 | backward_inner: 390350.25 | backward_allreduce_microstep: 7.54 | backward_allreduce: 2.59 | reduce_tied_grads: 0.28 | comms: 17.94 | reduce_grads: 0.19 | step: 350.22 | _step_clipping: 0.12 | _step_step: 348.56 | _step_zero_grad: 0.47 | _step_check_overflow: 0.52 samples/sec: 16.310 | iteration 2980/ 143000 | elapsed time per iteration (ms): 62782.9 | learning rate: 5.998E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 3.105998E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 03:35:05,247] [INFO] [logging.py:60:log_dist] [Rank 0] step=2990, skipped=0, lr=[0.0005998238325707403, 0.0005998238325707403], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 2990 loss: 3.0999 iter time (s): 63.019 samples/sec: 16.249 %comms: 0.002844227008665873 %optimizer_step 0.05547689768922708 %forward: 23.087553964212624 %backward: 61.94504441315388 [2025-03-27 03:35:05,248] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22215.29 | forward: 145495.75 | backward_microstep: 390379.72 | backward: 390372.26 | backward_inner_microstep: 390356.60 | backward_inner: 390350.63 | backward_allreduce_microstep: 7.56 | backward_allreduce: 2.59 | reduce_tied_grads: 0.31 | comms: 17.92 | reduce_grads: 0.19 | step: 349.61 | _step_clipping: 0.13 | _step_step: 347.88 | _step_zero_grad: 0.47 | _step_check_overflow: 0.59 samples/sec: 16.249 | iteration 2990/ 143000 | elapsed time per iteration (ms): 63019.7 | learning rate: 5.998E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 3.102219E+00 | loss scale: 16384.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 03:45:26,954] [INFO] [logging.py:60:log_dist] [Rank 0] step=3000, skipped=0, lr=[0.0005998215669996246, 0.0005998215669996246], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3000 loss: 3.0784 iter time (s): 62.170 samples/sec: 16.471 %comms: 0.002886361480356217 %optimizer_step 0.056903677985644906 %forward: 23.371589835288628 %backward: 62.7520730110653 [2025-03-27 03:45:26,955] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14191.45 | forward: 145301.70 | backward_microstep: 390137.43 | backward: 390131.05 | backward_inner_microstep: 390115.87 | backward_inner: 390110.18 | backward_allreduce_microstep: 7.35 | backward_allreduce: 2.52 | reduce_tied_grads: 0.29 | comms: 17.94 | reduce_grads: 0.19 | step: 353.77 | _step_clipping: 0.12 | _step_step: 351.97 | _step_zero_grad: 0.48 | _step_check_overflow: 0.65 samples/sec: 16.471 | iteration 3000/ 143000 | elapsed time per iteration (ms): 62170.8 | learning rate: 5.998E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 3.098266E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 03:45:29,792] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: ../checkpoints/mydata_left-pythia160m/global_step3000/mp_rank_00_model_states.pt [2025-03-27 03:45:43,706] [INFO] [engine.py:1805:_copy_recovery_script] creating recovery script ../checkpoints/mydata_left-pythia160m/zero_to_fp32.py [2025-03-27 03:45:43,712] [INFO] [engine.py:1818:_save_zero_checkpoint] zero checkpoint saved ../checkpoints/mydata_left-pythia160m/global_step3000/zero_pp_rank_0_mp_rank_00_optim_states.pt [2025-03-27 03:56:16,764] [INFO] [logging.py:60:log_dist] [Rank 0] step=3010, skipped=0, lr=[0.0005998192869577755, 0.0005998192869577755], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3010 loss: 3.0986 iter time (s): 63.304 samples/sec: 16.176 %comms: 0.0028162928187062163 %optimizer_step 0.055572454639026644 %forward: 22.995744080525977 %backward: 61.66592384497076 [2025-03-27 03:56:16,765] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24980.34 | forward: 145571.98 | backward_microstep: 390378.48 | backward: 390369.22 | backward_inner_microstep: 390352.95 | backward_inner: 390346.68 | backward_allreduce_microstep: 7.78 | backward_allreduce: 2.69 | reduce_tied_grads: 0.25 | comms: 17.83 | reduce_grads: 0.18 | step: 351.80 | _step_clipping: 0.12 | _step_step: 350.22 | _step_zero_grad: 0.46 | _step_check_overflow: 0.47 samples/sec: 15.758 | iteration 3010/ 143000 | elapsed time per iteration (ms): 64981.0 | learning rate: 5.998E-04 | approx flops per GPU: 68.0TFLOPS | lm_loss: 3.106089E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 04:06:41,713] [INFO] [logging.py:60:log_dist] [Rank 0] step=3020, skipped=0, lr=[0.0005998169924453032, 0.0005998169924453032], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3020 loss: 3.1041 iter time (s): 62.494 samples/sec: 16.386 %comms: 0.0029681113279776787 %optimizer_step 0.05791708443739683 %forward: 23.304543343976565 %backward: 62.4817594823253 [2025-03-27 04:06:41,713] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16709.29 | forward: 145639.81 | backward_microstep: 390484.11 | backward: 390474.57 | backward_inner_microstep: 390458.06 | backward_inner: 390451.63 | backward_allreduce_microstep: 7.86 | backward_allreduce: 2.71 | reduce_tied_grads: 0.35 | comms: 18.55 | reduce_grads: 0.22 | step: 361.95 | _step_clipping: 0.15 | _step_step: 359.95 | _step_zero_grad: 0.59 | _step_check_overflow: 0.63 samples/sec: 16.385 | iteration 3020/ 143000 | elapsed time per iteration (ms): 62494.8 | learning rate: 5.998E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 3.098902E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 04:17:14,436] [INFO] [logging.py:60:log_dist] [Rank 0] step=3030, skipped=0, lr=[0.0005998146834623184, 0.0005998146834623184], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3030 loss: 3.1006 iter time (s): 63.272 samples/sec: 16.184 %comms: 0.0028448168319826447 %optimizer_step 0.05518826333541239 %forward: 22.998162015302974 %backward: 61.68657612645963 [2025-03-27 04:17:14,436] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24860.03 | forward: 145513.36 | backward_microstep: 390311.07 | backward: 390301.66 | backward_inner_microstep: 390283.17 | backward_inner: 390276.97 | backward_allreduce_microstep: 10.05 | backward_allreduce: 2.73 | reduce_tied_grads: 0.32 | comms: 18.00 | reduce_grads: 0.20 | step: 349.19 | _step_clipping: 0.15 | _step_step: 347.41 | _step_zero_grad: 0.52 | _step_check_overflow: 0.52 samples/sec: 16.184 | iteration 3030/ 143000 | elapsed time per iteration (ms): 63272.3 | learning rate: 5.998E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 3.096800E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 04:27:42,637] [INFO] [logging.py:60:log_dist] [Rank 0] step=3040, skipped=0, lr=[0.0005998123600089326, 0.0005998123600089326], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3040 loss: 3.0651 iter time (s): 62.820 samples/sec: 16.301 %comms: 0.002883321136100441 %optimizer_step 0.05736674648119786 %forward: 23.198113674998222 %backward: 62.154792876891285 [2025-03-27 04:27:42,638] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19951.59 | forward: 145729.54 | backward_microstep: 390464.34 | backward: 390453.70 | backward_inner_microstep: 390437.46 | backward_inner: 390431.16 | backward_allreduce_microstep: 7.65 | backward_allreduce: 2.65 | reduce_tied_grads: 0.33 | comms: 18.11 | reduce_grads: 0.19 | step: 360.38 | _step_clipping: 0.15 | _step_step: 358.53 | _step_zero_grad: 0.52 | _step_check_overflow: 0.59 samples/sec: 16.300 | iteration 3040/ 143000 | elapsed time per iteration (ms): 62820.2 | learning rate: 5.998E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 3.085416E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 04:38:07,634] [INFO] [logging.py:60:log_dist] [Rank 0] step=3050, skipped=0, lr=[0.0005998100220852579, 0.0005998100220852579], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3050 loss: 3.0828 iter time (s): 62.499 samples/sec: 16.384 %comms: 0.00286835332912434 %optimizer_step 0.056574894311435794 %forward: 23.282555352705554 %backward: 62.44833716696366 [2025-03-27 04:38:07,635] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17161.99 | forward: 145513.72 | backward_microstep: 390305.55 | backward: 390296.08 | backward_inner_microstep: 390279.71 | backward_inner: 390273.34 | backward_allreduce_microstep: 7.83 | backward_allreduce: 2.68 | reduce_tied_grads: 0.30 | comms: 17.93 | reduce_grads: 0.21 | step: 353.59 | _step_clipping: 0.16 | _step_step: 351.84 | _step_zero_grad: 0.50 | _step_check_overflow: 0.52 samples/sec: 16.384 | iteration 3050/ 143000 | elapsed time per iteration (ms): 62499.7 | learning rate: 5.998E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 3.077175E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 04:48:34,021] [INFO] [logging.py:60:log_dist] [Rank 0] step=3060, skipped=0, lr=[0.0005998076696914071, 0.0005998076696914071], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3060 loss: 3.1084 iter time (s): 62.638 samples/sec: 16.348 %comms: 0.0028584488022868858 %optimizer_step 0.05639702206348546 %forward: 23.233515561214404 %backward: 62.311858397989816 [2025-03-27 04:48:34,021] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18520.47 | forward: 145530.15 | backward_microstep: 390319.13 | backward: 390309.17 | backward_inner_microstep: 390292.74 | backward_inner: 390286.44 | backward_allreduce_microstep: 7.80 | backward_allreduce: 2.71 | reduce_tied_grads: 0.30 | comms: 17.90 | reduce_grads: 0.18 | step: 353.26 | _step_clipping: 0.12 | _step_step: 351.38 | _step_zero_grad: 0.46 | _step_check_overflow: 0.74 samples/sec: 16.348 | iteration 3060/ 143000 | elapsed time per iteration (ms): 62638.6 | learning rate: 5.998E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 3.082831E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 04:59:01,870] [INFO] [logging.py:60:log_dist] [Rank 0] step=3070, skipped=0, lr=[0.0005998053028274939, 0.0005998053028274939], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3070 loss: 3.0623 iter time (s): 62.784 samples/sec: 16.310 %comms: 0.0028695255116624775 %optimizer_step 0.05795123826481454 %forward: 23.25591459324493 %backward: 62.20928420368699 [2025-03-27 04:59:01,870] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19255.69 | forward: 146010.51 | backward_microstep: 390588.97 | backward: 390576.31 | backward_inner_microstep: 390559.16 | backward_inner: 390552.53 | backward_allreduce_microstep: 8.06 | backward_allreduce: 2.79 | reduce_tied_grads: 0.31 | comms: 18.02 | reduce_grads: 0.19 | step: 363.84 | _step_clipping: 0.12 | _step_step: 362.02 | _step_zero_grad: 0.57 | _step_check_overflow: 0.53 samples/sec: 16.310 | iteration 3070/ 143000 | elapsed time per iteration (ms): 62784.9 | learning rate: 5.998E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 3.075075E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 05:09:27,626] [INFO] [logging.py:60:log_dist] [Rank 0] step=3080, skipped=0, lr=[0.0005998029214936324, 0.0005998029214936324], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3080 loss: 3.0907 iter time (s): 62.575 samples/sec: 16.364 %comms: 0.002908376841567667 %optimizer_step 0.05918281878931717 %forward: 23.34741510396544 %backward: 62.41357030368573 [2025-03-27 05:09:27,627] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17126.86 | forward: 146096.75 | backward_microstep: 390564.56 | backward: 390553.72 | backward_inner_microstep: 390536.79 | backward_inner: 390530.42 | backward_allreduce_microstep: 7.90 | backward_allreduce: 2.65 | reduce_tied_grads: 0.32 | comms: 18.20 | reduce_grads: 0.21 | step: 370.34 | _step_clipping: 0.12 | _step_step: 368.47 | _step_zero_grad: 0.46 | _step_check_overflow: 0.56 samples/sec: 16.364 | iteration 3080/ 143000 | elapsed time per iteration (ms): 62575.7 | learning rate: 5.998E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 3.081659E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 05:19:53,913] [INFO] [logging.py:60:log_dist] [Rank 0] step=3090, skipped=0, lr=[0.0005998005256899374, 0.0005998005256899374], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3090 loss: 3.1088 iter time (s): 62.628 samples/sec: 16.350 %comms: 0.002867733957554668 %optimizer_step 0.05920913188273228 %forward: 23.273703502759858 %backward: 62.35107652081264 [2025-03-27 05:19:53,914] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18089.29 | forward: 145758.77 | backward_microstep: 390503.80 | backward: 390492.92 | backward_inner_microstep: 390476.29 | backward_inner: 390469.92 | backward_allreduce_microstep: 7.92 | backward_allreduce: 2.74 | reduce_tied_grads: 0.31 | comms: 17.96 | reduce_grads: 0.19 | step: 370.82 | _step_clipping: 0.12 | _step_step: 368.97 | _step_zero_grad: 0.50 | _step_check_overflow: 0.65 samples/sec: 16.350 | iteration 3090/ 143000 | elapsed time per iteration (ms): 62628.7 | learning rate: 5.998E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 3.074403E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 05:30:18,005] [INFO] [logging.py:60:log_dist] [Rank 0] step=3100, skipped=0, lr=[0.0005997981154165249, 0.0005997981154165249], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3100 loss: 3.0685 iter time (s): 62.409 samples/sec: 16.408 %comms: 0.0029008537371128175 %optimizer_step 0.0578073031295124 %forward: 23.3321687220381 %backward: 62.551658031293364 [2025-03-27 05:30:18,006] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16235.22 | forward: 145612.93 | backward_microstep: 390388.08 | backward: 390376.48 | backward_inner_microstep: 390355.57 | backward_inner: 390349.14 | backward_allreduce_microstep: 11.86 | backward_allreduce: 4.99 | reduce_tied_grads: 0.32 | comms: 18.10 | reduce_grads: 0.21 | step: 360.77 | _step_clipping: 0.11 | _step_step: 358.92 | _step_zero_grad: 0.50 | _step_check_overflow: 0.63 samples/sec: 16.408 | iteration 3100/ 143000 | elapsed time per iteration (ms): 62409.3 | learning rate: 5.998E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 3.078884E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 05:40:41,083] [INFO] [logging.py:60:log_dist] [Rank 0] step=3110, skipped=0, lr=[0.0005997956906735108, 0.0005997956906735108], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3110 loss: 3.0547 iter time (s): 62.307 samples/sec: 16.435 %comms: 0.002907231638906316 %optimizer_step 0.056904000681192814 %forward: 23.33444100175096 %backward: 62.63721748005763 [2025-03-27 05:40:41,083] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15597.20 | forward: 145389.92 | backward_microstep: 390283.32 | backward: 390273.75 | backward_inner_microstep: 390257.54 | backward_inner: 390251.31 | backward_allreduce_microstep: 7.67 | backward_allreduce: 2.63 | reduce_tied_grads: 0.31 | comms: 18.11 | reduce_grads: 0.19 | step: 354.55 | _step_clipping: 0.11 | _step_step: 352.85 | _step_zero_grad: 0.47 | _step_check_overflow: 0.55 samples/sec: 16.435 | iteration 3110/ 143000 | elapsed time per iteration (ms): 62307.7 | learning rate: 5.998E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 3.071482E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 05:51:05,452] [INFO] [logging.py:60:log_dist] [Rank 0] step=3120, skipped=0, lr=[0.0005997932514610124, 0.0005997932514610124], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3120 loss: 3.0749 iter time (s): 62.436 samples/sec: 16.401 %comms: 0.002875087190555061 %optimizer_step 0.05675402715756222 %forward: 23.300134273506345 %backward: 62.518915071926095 [2025-03-27 05:51:05,452] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16726.40 | forward: 145477.67 | backward_microstep: 390355.35 | backward: 390345.65 | backward_inner_microstep: 390329.01 | backward_inner: 390322.56 | backward_allreduce_microstep: 7.94 | backward_allreduce: 2.72 | reduce_tied_grads: 0.28 | comms: 17.95 | reduce_grads: 0.19 | step: 354.35 | _step_clipping: 0.11 | _step_step: 352.50 | _step_zero_grad: 0.45 | _step_check_overflow: 0.72 samples/sec: 16.401 | iteration 3120/ 143000 | elapsed time per iteration (ms): 62436.9 | learning rate: 5.998E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 3.069739E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 06:01:30,636] [INFO] [logging.py:60:log_dist] [Rank 0] step=3130, skipped=0, lr=[0.0005997907977791474, 0.0005997907977791474], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3130 loss: 3.0707 iter time (s): 62.518 samples/sec: 16.379 %comms: 0.002871188175399725 %optimizer_step 0.056443323374090665 %forward: 23.27531894003767 %backward: 62.447060642251884 [2025-03-27 06:01:30,637] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17474.12 | forward: 145512.35 | backward_microstep: 390415.83 | backward: 390405.75 | backward_inner_microstep: 390389.23 | backward_inner: 390380.79 | backward_allreduce_microstep: 7.84 | backward_allreduce: 2.69 | reduce_tied_grads: 0.31 | comms: 17.95 | reduce_grads: 0.20 | step: 352.87 | _step_clipping: 0.12 | _step_step: 351.19 | _step_zero_grad: 0.46 | _step_check_overflow: 0.53 samples/sec: 16.379 | iteration 3130/ 143000 | elapsed time per iteration (ms): 62518.4 | learning rate: 5.998E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 3.069715E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 06:11:56,167] [INFO] [logging.py:60:log_dist] [Rank 0] step=3140, skipped=0, lr=[0.0005997883296280343, 0.0005997883296280343], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3140 loss: 3.0652 iter time (s): 62.553 samples/sec: 16.370 %comms: 0.0028955916699425215 %optimizer_step 0.05902177844321525 %forward: 23.270254940350394 %backward: 62.405293127241436 [2025-03-27 06:11:56,168] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17817.33 | forward: 145561.34 | backward_microstep: 390371.07 | backward: 390360.92 | backward_inner_microstep: 390344.17 | backward_inner: 390337.56 | backward_allreduce_microstep: 7.95 | backward_allreduce: 2.73 | reduce_tied_grads: 0.34 | comms: 18.11 | reduce_grads: 0.21 | step: 369.20 | _step_clipping: 0.12 | _step_step: 367.16 | _step_zero_grad: 0.55 | _step_check_overflow: 0.73 samples/sec: 16.370 | iteration 3140/ 143000 | elapsed time per iteration (ms): 62553.2 | learning rate: 5.998E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 3.070088E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 06:22:21,940] [INFO] [logging.py:60:log_dist] [Rank 0] step=3150, skipped=0, lr=[0.000599785847007792, 0.000599785847007792], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3150 loss: 3.0878 iter time (s): 62.577 samples/sec: 16.364 %comms: 0.0028744367590434303 %optimizer_step 0.05637799801374606 %forward: 23.265134216053035 %backward: 62.39523863122181 [2025-03-27 06:22:21,940] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17959.67 | forward: 145585.33 | backward_microstep: 390458.23 | backward: 390448.26 | backward_inner_microstep: 390431.48 | backward_inner: 390425.03 | backward_allreduce_microstep: 7.97 | backward_allreduce: 2.75 | reduce_tied_grads: 0.31 | comms: 17.99 | reduce_grads: 0.21 | step: 352.79 | _step_clipping: 0.12 | _step_step: 351.18 | _step_zero_grad: 0.46 | _step_check_overflow: 0.45 samples/sec: 16.364 | iteration 3150/ 143000 | elapsed time per iteration (ms): 62577.2 | learning rate: 5.998E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 3.060466E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 06:32:45,490] [INFO] [logging.py:60:log_dist] [Rank 0] step=3160, skipped=0, lr=[0.0005997833499185404, 0.0005997833499185404], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3160 loss: 3.0434 iter time (s): 62.355 samples/sec: 16.422 %comms: 0.0028914427486686654 %optimizer_step 0.05641179146312603 %forward: 23.328301416426463 %backward: 62.62070442385773 [2025-03-27 06:32:45,491] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15869.77 | forward: 145462.49 | backward_microstep: 390479.68 | backward: 390468.37 | backward_inner_microstep: 390451.95 | backward_inner: 390445.58 | backward_allreduce_microstep: 7.81 | backward_allreduce: 2.83 | reduce_tied_grads: 0.29 | comms: 18.03 | reduce_grads: 0.19 | step: 351.75 | _step_clipping: 0.10 | _step_step: 350.00 | _step_zero_grad: 0.51 | _step_check_overflow: 0.56 samples/sec: 16.422 | iteration 3160/ 143000 | elapsed time per iteration (ms): 62355.1 | learning rate: 5.998E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 3.063160E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 06:43:10,573] [INFO] [logging.py:60:log_dist] [Rank 0] step=3170, skipped=0, lr=[0.0005997808383604002, 0.0005997808383604002], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3170 loss: 3.0656 iter time (s): 62.508 samples/sec: 16.382 %comms: 0.0028821430848098802 %optimizer_step 0.056317914243912214 %forward: 23.272243910930925 %backward: 62.45442414099372 [2025-03-27 06:43:10,573] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17523.99 | forward: 145469.52 | backward_microstep: 390397.87 | backward: 390388.45 | backward_inner_microstep: 390371.66 | backward_inner: 390363.45 | backward_allreduce_microstep: 8.05 | backward_allreduce: 2.79 | reduce_tied_grads: 0.28 | comms: 18.02 | reduce_grads: 0.19 | step: 352.03 | _step_clipping: 0.11 | _step_step: 350.32 | _step_zero_grad: 0.47 | _step_check_overflow: 0.57 samples/sec: 16.382 | iteration 3170/ 143000 | elapsed time per iteration (ms): 62508.3 | learning rate: 5.998E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 3.061459E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 06:53:34,652] [INFO] [logging.py:60:log_dist] [Rank 0] step=3180, skipped=0, lr=[0.0005997783123334923, 0.0005997783123334923], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3180 loss: 3.0384 iter time (s): 62.407 samples/sec: 16.408 %comms: 0.0029382758140676117 %optimizer_step 0.05742586942074589 %forward: 23.30660700024876 %backward: 62.56627764927914 [2025-03-27 06:53:34,653] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16509.38 | forward: 145450.44 | backward_microstep: 390471.21 | backward: 390459.78 | backward_inner_microstep: 390443.34 | backward_inner: 390437.00 | backward_allreduce_microstep: 7.84 | backward_allreduce: 2.64 | reduce_tied_grads: 0.34 | comms: 18.34 | reduce_grads: 0.20 | step: 358.38 | _step_clipping: 0.11 | _step_step: 356.60 | _step_zero_grad: 0.47 | _step_check_overflow: 0.58 samples/sec: 16.408 | iteration 3180/ 143000 | elapsed time per iteration (ms): 62407.9 | learning rate: 5.998E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 3.050399E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 07:03:57,768] [INFO] [logging.py:60:log_dist] [Rank 0] step=3190, skipped=0, lr=[0.0005997757718379389, 0.0005997757718379389], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3190 loss: 3.0723 iter time (s): 62.311 samples/sec: 16.434 %comms: 0.0029168777660958417 %optimizer_step 0.05616901876434358 %forward: 23.329988828193244 %backward: 62.65651221290959 [2025-03-27 07:03:57,769] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15695.93 | forward: 145371.55 | backward_microstep: 390429.88 | backward: 390419.13 | backward_inner_microstep: 390402.96 | backward_inner: 390396.44 | backward_allreduce_microstep: 7.63 | backward_allreduce: 2.60 | reduce_tied_grads: 0.31 | comms: 18.18 | reduce_grads: 0.20 | step: 349.99 | _step_clipping: 0.11 | _step_step: 348.28 | _step_zero_grad: 0.48 | _step_check_overflow: 0.54 samples/sec: 16.434 | iteration 3190/ 143000 | elapsed time per iteration (ms): 62311.6 | learning rate: 5.998E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 3.051143E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 07:14:22,128] [INFO] [logging.py:60:log_dist] [Rank 0] step=3200, skipped=0, lr=[0.0005997732168738627, 0.0005997732168738627], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3200 loss: 3.0459 iter time (s): 62.435 samples/sec: 16.401 %comms: 0.002877081526746814 %optimizer_step 0.05661114136457885 %forward: 23.310855843211293 %backward: 62.54312171189912 [2025-03-27 07:14:22,128] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16705.93 | forward: 145542.24 | backward_microstep: 390501.50 | backward: 390490.43 | backward_inner_microstep: 390474.07 | backward_inner: 390467.71 | backward_allreduce_microstep: 7.72 | backward_allreduce: 2.66 | reduce_tied_grads: 0.30 | comms: 17.96 | reduce_grads: 0.19 | step: 353.45 | _step_clipping: 0.11 | _step_step: 351.48 | _step_zero_grad: 0.52 | _step_check_overflow: 0.54 samples/sec: 16.401 | iteration 3200/ 143000 | elapsed time per iteration (ms): 62436.0 | learning rate: 5.998E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 3.057750E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 07:24:47,437] [INFO] [logging.py:60:log_dist] [Rank 0] step=3210, skipped=0, lr=[0.0005997706474413865, 0.0005997706474413865], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3210 loss: 3.0483 iter time (s): 62.530 samples/sec: 16.376 %comms: 0.0028777413827195306 %optimizer_step 0.058597258252649 %forward: 23.2771151744511 %backward: 62.422101023011535 [2025-03-27 07:24:47,438] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17817.71 | forward: 145552.82 | backward_microstep: 390337.73 | backward: 390328.12 | backward_inner_microstep: 390311.49 | backward_inner: 390304.97 | backward_allreduce_microstep: 7.86 | backward_allreduce: 2.71 | reduce_tied_grads: 0.33 | comms: 17.99 | reduce_grads: 0.21 | step: 366.41 | _step_clipping: 0.11 | _step_step: 364.61 | _step_zero_grad: 0.52 | _step_check_overflow: 0.58 samples/sec: 16.376 | iteration 3210/ 143000 | elapsed time per iteration (ms): 62531.0 | learning rate: 5.998E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 3.042017E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 07:35:12,930] [INFO] [logging.py:60:log_dist] [Rank 0] step=3220, skipped=0, lr=[0.0005997680635406349, 0.0005997680635406349], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3220 loss: 3.0362 iter time (s): 62.549 samples/sec: 16.371 %comms: 0.0028742731761180465 %optimizer_step 0.05858453777056648 %forward: 23.266307982333494 %backward: 62.43073205477822 [2025-03-27 07:35:12,930] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17836.87 | forward: 145527.62 | backward_microstep: 390505.92 | backward: 390495.82 | backward_inner_microstep: 390477.45 | backward_inner: 390471.09 | backward_allreduce_microstep: 9.60 | backward_allreduce: 2.72 | reduce_tied_grads: 0.32 | comms: 17.98 | reduce_grads: 0.20 | step: 366.44 | _step_clipping: 0.12 | _step_step: 364.69 | _step_zero_grad: 0.49 | _step_check_overflow: 0.56 samples/sec: 16.371 | iteration 3220/ 143000 | elapsed time per iteration (ms): 62549.2 | learning rate: 5.998E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 3.042355E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 07:45:38,625] [INFO] [logging.py:60:log_dist] [Rank 0] step=3230, skipped=0, lr=[0.0005997654651717322, 0.0005997654651717322], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3230 loss: 3.0385 iter time (s): 62.569 samples/sec: 16.366 %comms: 0.00299947019927589 %optimizer_step 0.060975598510486784 %forward: 23.253680907714948 %backward: 62.40300014176113 [2025-03-27 07:45:38,626] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18150.37 | forward: 145495.74 | backward_microstep: 390458.49 | backward: 390448.74 | backward_inner_microstep: 390431.99 | backward_inner: 390425.50 | backward_allreduce_microstep: 8.04 | backward_allreduce: 2.79 | reduce_tied_grads: 0.28 | comms: 18.77 | reduce_grads: 0.20 | step: 381.52 | _step_clipping: 0.12 | _step_step: 379.78 | _step_zero_grad: 0.50 | _step_check_overflow: 0.54 samples/sec: 16.366 | iteration 3230/ 143000 | elapsed time per iteration (ms): 62569.5 | learning rate: 5.998E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 3.046599E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 07:56:04,795] [INFO] [logging.py:60:log_dist] [Rank 0] step=3240, skipped=0, lr=[0.0005997628523348041, 0.0005997628523348041], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3240 loss: 3.0539 iter time (s): 62.616 samples/sec: 16.354 %comms: 0.0028745499948525967 %optimizer_step 0.05721513904914535 %forward: 23.250009554704143 %backward: 62.354282718245976 [2025-03-27 07:56:04,797] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18598.48 | forward: 145583.30 | backward_microstep: 390449.97 | backward: 390440.36 | backward_inner_microstep: 390423.72 | backward_inner: 390417.27 | backward_allreduce_microstep: 7.98 | backward_allreduce: 2.75 | reduce_tied_grads: 0.34 | comms: 18.00 | reduce_grads: 0.20 | step: 358.26 | _step_clipping: 0.13 | _step_step: 356.50 | _step_zero_grad: 0.49 | _step_check_overflow: 0.56 samples/sec: 16.353 | iteration 3240/ 143000 | elapsed time per iteration (ms): 62617.1 | learning rate: 5.998E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 3.040982E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 08:06:29,758] [INFO] [logging.py:60:log_dist] [Rank 0] step=3250, skipped=0, lr=[0.0005997602250299764, 0.0005997602250299764], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3250 loss: 3.0425 iter time (s): 62.496 samples/sec: 16.385 %comms: 0.0028717908505483604 %optimizer_step 0.06393903444962147 %forward: 23.269473175818714 %backward: 62.461311943969655 [2025-03-27 08:06:29,759] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17582.18 | forward: 145424.02 | backward_microstep: 390365.96 | backward: 390355.85 | backward_inner_microstep: 390339.15 | backward_inner: 390332.46 | backward_allreduce_microstep: 7.93 | backward_allreduce: 2.73 | reduce_tied_grads: 0.31 | comms: 17.95 | reduce_grads: 0.20 | step: 399.59 | _step_clipping: 0.11 | _step_step: 397.88 | _step_zero_grad: 0.49 | _step_check_overflow: 0.52 samples/sec: 16.385 | iteration 3250/ 143000 | elapsed time per iteration (ms): 62496.2 | learning rate: 5.998E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 3.034168E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 08:16:55,167] [INFO] [logging.py:60:log_dist] [Rank 0] step=3260, skipped=0, lr=[0.0005997575832573761, 0.0005997575832573761], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3260 loss: 3.0425 iter time (s): 62.540 samples/sec: 16.373 %comms: 0.002892230830240629 %optimizer_step 0.06145412007099475 %forward: 23.26592717450146 %backward: 62.41552993437198 [2025-03-27 08:16:55,168] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18036.05 | forward: 145505.84 | backward_microstep: 390358.10 | backward: 390348.68 | backward_inner_microstep: 390332.09 | backward_inner: 390325.74 | backward_allreduce_microstep: 7.92 | backward_allreduce: 2.72 | reduce_tied_grads: 0.33 | comms: 18.09 | reduce_grads: 0.21 | step: 384.34 | _step_clipping: 0.12 | _step_step: 382.50 | _step_zero_grad: 0.53 | _step_check_overflow: 0.58 samples/sec: 16.373 | iteration 3260/ 143000 | elapsed time per iteration (ms): 62540.9 | learning rate: 5.998E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 3.052592E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 08:27:19,429] [INFO] [logging.py:60:log_dist] [Rank 0] step=3270, skipped=0, lr=[0.0005997549270171308, 0.0005997549270171308], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3270 loss: 3.0255 iter time (s): 62.426 samples/sec: 16.404 %comms: 0.0028767646958052936 %optimizer_step 0.057579558885403934 %forward: 23.292662887054206 %backward: 62.5337034536074 [2025-03-27 08:27:19,430] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16977.61 | forward: 145406.06 | backward_microstep: 390382.12 | backward: 390370.97 | backward_inner_microstep: 390354.69 | backward_inner: 390348.23 | backward_allreduce_microstep: 7.66 | backward_allreduce: 2.64 | reduce_tied_grads: 0.30 | comms: 17.96 | reduce_grads: 0.19 | step: 359.44 | _step_clipping: 0.12 | _step_step: 357.73 | _step_zero_grad: 0.47 | _step_check_overflow: 0.57 samples/sec: 16.403 | iteration 3270/ 143000 | elapsed time per iteration (ms): 62426.2 | learning rate: 5.998E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 3.036237E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 08:37:42,159] [INFO] [logging.py:60:log_dist] [Rank 0] step=3280, skipped=0, lr=[0.0005997522563093684, 0.0005997522563093684], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3280 loss: 3.0236 iter time (s): 62.272 samples/sec: 16.444 %comms: 0.0029234346373288706 %optimizer_step 0.05691813183694003 %forward: 23.346344258041682 %backward: 62.684281664965646 [2025-03-27 08:37:42,159] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15533.58 | forward: 145383.28 | backward_microstep: 390362.85 | backward: 390350.03 | backward_inner_microstep: 390333.82 | backward_inner: 390327.52 | backward_allreduce_microstep: 7.66 | backward_allreduce: 2.65 | reduce_tied_grads: 0.33 | comms: 18.20 | reduce_grads: 0.19 | step: 354.44 | _step_clipping: 0.12 | _step_step: 352.71 | _step_zero_grad: 0.46 | _step_check_overflow: 0.56 samples/sec: 16.444 | iteration 3280/ 143000 | elapsed time per iteration (ms): 62272.9 | learning rate: 5.998E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 3.043547E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 08:48:09,322] [INFO] [logging.py:60:log_dist] [Rank 0] step=3290, skipped=0, lr=[0.0005997495711342183, 0.0005997495711342183], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3290 loss: 3.0454 iter time (s): 62.716 samples/sec: 16.328 %comms: 0.0028826941577712664 %optimizer_step 0.05602274258615115 %forward: 23.185444104070257 %backward: 62.24137269692015 [2025-03-27 08:48:09,323] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19983.68 | forward: 145409.33 | backward_microstep: 390360.33 | backward: 390351.65 | backward_inner_microstep: 390335.71 | backward_inner: 390329.46 | backward_allreduce_microstep: 7.58 | backward_allreduce: 2.61 | reduce_tied_grads: 0.30 | comms: 18.08 | reduce_grads: 0.18 | step: 351.35 | _step_clipping: 0.12 | _step_step: 349.71 | _step_zero_grad: 0.48 | _step_check_overflow: 0.48 samples/sec: 16.327 | iteration 3290/ 143000 | elapsed time per iteration (ms): 62716.3 | learning rate: 5.997E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 3.029783E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 08:58:35,150] [INFO] [logging.py:60:log_dist] [Rank 0] step=3300, skipped=0, lr=[0.0005997468714918094, 0.0005997468714918094], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3300 loss: 3.0365 iter time (s): 62.582 samples/sec: 16.362 %comms: 0.00294290720702401 %optimizer_step 0.05620538271443039 %forward: 23.245906613721605 %backward: 62.39662585082545 [2025-03-27 08:58:35,150] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18416.65 | forward: 145477.98 | backward_microstep: 390503.93 | backward: 390491.76 | backward_inner_microstep: 390473.47 | backward_inner: 390467.03 | backward_allreduce_microstep: 9.57 | backward_allreduce: 2.72 | reduce_tied_grads: 0.31 | comms: 18.42 | reduce_grads: 0.19 | step: 351.75 | _step_clipping: 0.14 | _step_step: 349.87 | _step_zero_grad: 0.50 | _step_check_overflow: 0.63 samples/sec: 16.362 | iteration 3300/ 143000 | elapsed time per iteration (ms): 62582.8 | learning rate: 5.997E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 3.029624E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 09:09:03,694] [INFO] [logging.py:60:log_dist] [Rank 0] step=3310, skipped=0, lr=[0.0005997441573822725, 0.0005997441573822725], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3310 loss: 3.0391 iter time (s): 62.854 samples/sec: 16.292 %comms: 0.002878904895651781 %optimizer_step 0.058059154515315785 %forward: 23.168717039820557 %backward: 62.13755963450394 [2025-03-27 09:09:03,694] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20918.42 | forward: 145624.23 | backward_microstep: 390572.32 | backward: 390558.29 | backward_inner_microstep: 390541.40 | backward_inner: 390534.96 | backward_allreduce_microstep: 8.05 | backward_allreduce: 2.92 | reduce_tied_grads: 0.33 | comms: 18.10 | reduce_grads: 0.20 | step: 364.92 | _step_clipping: 0.13 | _step_step: 363.15 | _step_zero_grad: 0.51 | _step_check_overflow: 0.54 samples/sec: 16.292 | iteration 3310/ 143000 | elapsed time per iteration (ms): 62854.4 | learning rate: 5.997E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 3.025507E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 09:19:33,655] [INFO] [logging.py:60:log_dist] [Rank 0] step=3320, skipped=0, lr=[0.0005997414288057386, 0.0005997414288057386], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3320 loss: 3.0310 iter time (s): 62.996 samples/sec: 16.255 %comms: 0.002845403241998227 %optimizer_step 0.057340730236768604 %forward: 23.117994000418328 %backward: 61.99867154237928 [2025-03-27 09:19:33,656] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22338.35 | forward: 145633.17 | backward_microstep: 390576.68 | backward: 390564.29 | backward_inner_microstep: 390547.68 | backward_inner: 390541.43 | backward_allreduce_microstep: 7.71 | backward_allreduce: 2.67 | reduce_tied_grads: 0.29 | comms: 17.92 | reduce_grads: 0.19 | step: 361.22 | _step_clipping: 0.12 | _step_step: 359.51 | _step_zero_grad: 0.50 | _step_check_overflow: 0.53 samples/sec: 16.255 | iteration 3320/ 143000 | elapsed time per iteration (ms): 62996.1 | learning rate: 5.997E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 3.036490E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 09:30:01,810] [INFO] [logging.py:60:log_dist] [Rank 0] step=3330, skipped=0, lr=[0.0005997386857623391, 0.0005997386857623391], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3330 loss: 3.0438 iter time (s): 62.815 samples/sec: 16.302 %comms: 0.0028684639238022724 %optimizer_step 0.05585716142176277 %forward: 23.14244871666435 %backward: 62.14585542692607 [2025-03-27 09:30:01,811] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21068.09 | forward: 145369.21 | backward_microstep: 390377.31 | backward: 390368.96 | backward_inner_microstep: 390353.15 | backward_inner: 390347.09 | backward_allreduce_microstep: 7.58 | backward_allreduce: 2.58 | reduce_tied_grads: 0.29 | comms: 18.02 | reduce_grads: 0.18 | step: 350.87 | _step_clipping: 0.12 | _step_step: 349.16 | _step_zero_grad: 0.48 | _step_check_overflow: 0.54 samples/sec: 16.302 | iteration 3330/ 143000 | elapsed time per iteration (ms): 62815.5 | learning rate: 5.997E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 3.031130E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 09:40:30,431] [INFO] [logging.py:60:log_dist] [Rank 0] step=3340, skipped=0, lr=[0.0005997359282522067, 0.0005997359282522067], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3340 loss: 3.0358 iter time (s): 62.862 samples/sec: 16.290 %comms: 0.0028849626737639114 %optimizer_step 0.055848993578125804 %forward: 23.13175034188031 %backward: 62.106863985951435 [2025-03-27 09:40:30,432] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21450.10 | forward: 145409.66 | backward_microstep: 390423.82 | backward: 390413.09 | backward_inner_microstep: 390397.13 | backward_inner: 390391.01 | backward_allreduce_microstep: 7.68 | backward_allreduce: 2.58 | reduce_tied_grads: 0.32 | comms: 18.14 | reduce_grads: 0.19 | step: 351.08 | _step_clipping: 0.14 | _step_step: 349.29 | _step_zero_grad: 0.49 | _step_check_overflow: 0.58 samples/sec: 16.290 | iteration 3340/ 143000 | elapsed time per iteration (ms): 62862.1 | learning rate: 5.997E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 3.033077E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 09:50:53,047] [INFO] [logging.py:60:log_dist] [Rank 0] step=3350, skipped=0, lr=[0.0005997331562754743, 0.0005997331562754743], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3350 loss: 3.0174 iter time (s): 62.261 samples/sec: 16.447 %comms: 0.002881078466604179 %optimizer_step 0.057303083353020784 %forward: 23.372630063671362 %backward: 62.71088279901234 [2025-03-27 09:50:53,048] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15336.92 | forward: 145520.46 | backward_microstep: 390454.64 | backward: 390444.58 | backward_inner_microstep: 390428.31 | backward_inner: 390422.04 | backward_allreduce_microstep: 7.74 | backward_allreduce: 2.67 | reduce_tied_grads: 0.31 | comms: 17.94 | reduce_grads: 0.19 | step: 356.78 | _step_clipping: 0.14 | _step_step: 355.05 | _step_zero_grad: 0.48 | _step_check_overflow: 0.55 samples/sec: 16.447 | iteration 3350/ 143000 | elapsed time per iteration (ms): 62261.6 | learning rate: 5.997E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 3.019131E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 10:01:22,535] [INFO] [logging.py:60:log_dist] [Rank 0] step=3360, skipped=0, lr=[0.0005997303698322757, 0.0005997303698322757], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3360 loss: 3.0264 iter time (s): 62.948 samples/sec: 16.267 %comms: 0.002846673290645902 %optimizer_step 0.05521922755622711 %forward: 23.091610606675818 %backward: 61.9943659677289 [2025-03-27 10:01:22,535] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22602.27 | forward: 145357.58 | backward_microstep: 390251.11 | backward: 390243.51 | backward_inner_microstep: 390228.08 | backward_inner: 390222.05 | backward_allreduce_microstep: 7.44 | backward_allreduce: 2.55 | reduce_tied_grads: 0.28 | comms: 17.92 | reduce_grads: 0.19 | step: 347.60 | _step_clipping: 0.12 | _step_step: 345.89 | _step_zero_grad: 0.48 | _step_check_overflow: 0.55 samples/sec: 16.267 | iteration 3360/ 143000 | elapsed time per iteration (ms): 62948.8 | learning rate: 5.997E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 3.022941E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 10:11:47,735] [INFO] [logging.py:60:log_dist] [Rank 0] step=3370, skipped=0, lr=[0.0005997275689227455, 0.0005997275689227455], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3370 loss: 3.0210 iter time (s): 62.519 samples/sec: 16.379 %comms: 0.0028551357454667777 %optimizer_step 0.05518253706462354 %forward: 23.233401613200392 %backward: 62.41658980085663 [2025-03-27 10:11:47,735] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18463.19 | forward: 145254.02 | backward_microstep: 390231.76 | backward: 390225.28 | backward_inner_microstep: 390210.24 | backward_inner: 390204.60 | backward_allreduce_microstep: 7.26 | backward_allreduce: 2.49 | reduce_tied_grads: 0.26 | comms: 17.85 | reduce_grads: 0.18 | step: 345.00 | _step_clipping: 0.12 | _step_step: 343.36 | _step_zero_grad: 0.46 | _step_check_overflow: 0.53 samples/sec: 16.379 | iteration 3370/ 143000 | elapsed time per iteration (ms): 62520.0 | learning rate: 5.997E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 3.023856E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 10:22:16,223] [INFO] [logging.py:60:log_dist] [Rank 0] step=3380, skipped=0, lr=[0.0005997247535470187, 0.0005997247535470187], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3380 loss: 3.0211 iter time (s): 62.848 samples/sec: 16.293 %comms: 0.0028606114183297713 %optimizer_step 0.05685502655356754 %forward: 23.114208499482015 %backward: 62.09810274582463 [2025-03-27 10:22:16,223] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21674.56 | forward: 145268.66 | backward_microstep: 390282.38 | backward: 390275.44 | backward_inner_microstep: 390260.26 | backward_inner: 390254.55 | backward_allreduce_microstep: 7.35 | backward_allreduce: 2.51 | reduce_tied_grads: 0.31 | comms: 17.98 | reduce_grads: 0.19 | step: 357.32 | _step_clipping: 0.14 | _step_step: 355.53 | _step_zero_grad: 0.48 | _step_check_overflow: 0.60 samples/sec: 16.293 | iteration 3380/ 143000 | elapsed time per iteration (ms): 62848.8 | learning rate: 5.997E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 3.015042E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 10:32:42,329] [INFO] [logging.py:60:log_dist] [Rank 0] step=3390, skipped=0, lr=[0.0005997219237052313, 0.0005997219237052313], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3390 loss: 3.0161 iter time (s): 62.610 samples/sec: 16.355 %comms: 0.0028714942070996344 %optimizer_step 0.0582248790968592 %forward: 23.22849400102736 %backward: 62.338995162347366 [2025-03-27 10:32:42,329] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19140.11 | forward: 145433.64 | backward_microstep: 390312.67 | backward: 390304.55 | backward_inner_microstep: 390288.85 | backward_inner: 390282.68 | backward_allreduce_microstep: 7.52 | backward_allreduce: 2.63 | reduce_tied_grads: 0.31 | comms: 17.98 | reduce_grads: 0.18 | step: 364.55 | _step_clipping: 0.12 | _step_step: 362.61 | _step_zero_grad: 0.54 | _step_check_overflow: 0.68 samples/sec: 16.355 | iteration 3390/ 143000 | elapsed time per iteration (ms): 62610.6 | learning rate: 5.997E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 3.016069E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 10:43:10,185] [INFO] [logging.py:60:log_dist] [Rank 0] step=3400, skipped=0, lr=[0.0005997190793975199, 0.0005997190793975199], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3400 loss: 2.9931 iter time (s): 62.785 samples/sec: 16.310 %comms: 0.002881792599262156 %optimizer_step 0.05562179835586778 %forward: 23.14926147308158 %backward: 62.15794890191009 [2025-03-27 10:43:10,186] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21058.99 | forward: 145342.74 | backward_microstep: 390267.35 | backward: 390258.96 | backward_inner_microstep: 390243.27 | backward_inner: 390237.25 | backward_allreduce_microstep: 7.48 | backward_allreduce: 2.58 | reduce_tied_grads: 0.30 | comms: 18.09 | reduce_grads: 0.18 | step: 349.22 | _step_clipping: 0.13 | _step_step: 347.45 | _step_zero_grad: 0.49 | _step_check_overflow: 0.59 samples/sec: 16.309 | iteration 3400/ 143000 | elapsed time per iteration (ms): 62785.6 | learning rate: 5.997E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 3.015569E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 10:53:42,112] [INFO] [logging.py:60:log_dist] [Rank 0] step=3410, skipped=0, lr=[0.0005997162206240217, 0.0005997162206240217], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3410 loss: 3.0204 iter time (s): 63.192 samples/sec: 16.205 %comms: 0.002836099039995227 %optimizer_step 0.054354062589858775 %forward: 23.032324918157 %backward: 61.77851822109724 [2025-03-27 10:53:42,113] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24787.30 | forward: 145546.25 | backward_microstep: 390401.93 | backward: 390391.85 | backward_inner_microstep: 390376.10 | backward_inner: 390370.06 | backward_allreduce_microstep: 7.57 | backward_allreduce: 2.62 | reduce_tied_grads: 0.29 | comms: 17.92 | reduce_grads: 0.18 | step: 343.48 | _step_clipping: 0.12 | _step_step: 341.88 | _step_zero_grad: 0.46 | _step_check_overflow: 0.47 samples/sec: 16.204 | iteration 3410/ 143000 | elapsed time per iteration (ms): 63192.7 | learning rate: 5.997E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 3.017211E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 11:04:08,912] [INFO] [logging.py:60:log_dist] [Rank 0] step=3420, skipped=0, lr=[0.0005997133473848748, 0.0005997133473848748], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3420 loss: 3.0037 iter time (s): 62.679 samples/sec: 16.337 %comms: 0.0028818956240022806 %optimizer_step 0.05722162683370856 %forward: 23.209930478269648 %backward: 62.27105274154921 [2025-03-27 11:04:08,912] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19863.22 | forward: 145478.42 | backward_microstep: 390319.20 | backward: 390311.14 | backward_inner_microstep: 390295.22 | backward_inner: 390289.14 | backward_allreduce_microstep: 7.73 | backward_allreduce: 2.79 | reduce_tied_grads: 0.31 | comms: 18.06 | reduce_grads: 0.21 | step: 358.66 | _step_clipping: 0.14 | _step_step: 356.90 | _step_zero_grad: 0.48 | _step_check_overflow: 0.55 samples/sec: 16.337 | iteration 3420/ 143000 | elapsed time per iteration (ms): 62679.9 | learning rate: 5.997E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 3.009679E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 11:14:35,743] [INFO] [logging.py:60:log_dist] [Rank 0] step=3430, skipped=0, lr=[0.0005997104596802178, 0.0005997104596802178], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3430 loss: 3.0339 iter time (s): 62.683 samples/sec: 16.336 %comms: 0.0028661558876695655 %optimizer_step 0.05494119668545157 %forward: 23.2348629446733 %backward: 62.27152794395337 [2025-03-27 11:14:35,743] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19744.66 | forward: 145642.03 | backward_microstep: 390343.09 | backward: 390333.77 | backward_inner_microstep: 390317.76 | backward_inner: 390311.62 | backward_allreduce_microstep: 7.64 | backward_allreduce: 2.60 | reduce_tied_grads: 0.30 | comms: 17.97 | reduce_grads: 0.18 | step: 344.39 | _step_clipping: 0.13 | _step_step: 342.68 | _step_zero_grad: 0.47 | _step_check_overflow: 0.54 samples/sec: 16.336 | iteration 3430/ 143000 | elapsed time per iteration (ms): 62683.1 | learning rate: 5.997E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 3.020918E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 11:25:00,716] [INFO] [logging.py:60:log_dist] [Rank 0] step=3440, skipped=0, lr=[0.0005997075575101899, 0.0005997075575101899], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3440 loss: 2.9960 iter time (s): 62.497 samples/sec: 16.385 %comms: 0.002916828655267745 %optimizer_step 0.056474562225864396 %forward: 23.25430050046461 %backward: 62.45443871981846 [2025-03-27 11:25:00,717] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18207.63 | forward: 145331.94 | backward_microstep: 390329.11 | backward: 390320.27 | backward_inner_microstep: 390303.96 | backward_inner: 390297.76 | backward_allreduce_microstep: 7.73 | backward_allreduce: 2.80 | reduce_tied_grads: 0.30 | comms: 18.23 | reduce_grads: 0.18 | step: 352.95 | _step_clipping: 0.14 | _step_step: 351.16 | _step_zero_grad: 0.51 | _step_check_overflow: 0.57 samples/sec: 16.385 | iteration 3440/ 143000 | elapsed time per iteration (ms): 62497.4 | learning rate: 5.997E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 3.010283E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 11:35:25,074] [INFO] [logging.py:60:log_dist] [Rank 0] step=3450, skipped=0, lr=[0.0005997046408749315, 0.0005997046408749315], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3450 loss: 3.0145 iter time (s): 62.435 samples/sec: 16.401 %comms: 0.0028634564681986018 %optimizer_step 0.055343267202226376 %forward: 23.280673485755603 %backward: 62.501328242689745 [2025-03-27 11:35:25,075] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17710.57 | forward: 145353.41 | backward_microstep: 390235.65 | backward: 390228.46 | backward_inner_microstep: 390213.14 | backward_inner: 390207.36 | backward_allreduce_microstep: 7.30 | backward_allreduce: 2.52 | reduce_tied_grads: 0.28 | comms: 17.88 | reduce_grads: 0.18 | step: 345.54 | _step_clipping: 0.11 | _step_step: 343.81 | _step_zero_grad: 0.49 | _step_check_overflow: 0.58 samples/sec: 16.401 | iteration 3450/ 143000 | elapsed time per iteration (ms): 62435.8 | learning rate: 5.997E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 3.003837E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 11:45:50,798] [INFO] [logging.py:60:log_dist] [Rank 0] step=3460, skipped=0, lr=[0.0005997017097745832, 0.0005997017097745832], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3460 loss: 3.0070 iter time (s): 62.572 samples/sec: 16.365 %comms: 0.002860370698570912 %optimizer_step 0.055296232611616994 %forward: 23.244839804041504 %backward: 62.36360543379701 [2025-03-27 11:45:50,798] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19013.77 | forward: 145447.07 | backward_microstep: 390227.25 | backward: 390220.09 | backward_inner_microstep: 390204.59 | backward_inner: 390198.73 | backward_allreduce_microstep: 7.48 | backward_allreduce: 2.60 | reduce_tied_grads: 0.27 | comms: 17.90 | reduce_grads: 0.19 | step: 346.00 | _step_clipping: 0.12 | _step_step: 344.32 | _step_zero_grad: 0.49 | _step_check_overflow: 0.52 samples/sec: 16.365 | iteration 3460/ 143000 | elapsed time per iteration (ms): 62572.3 | learning rate: 5.997E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 3.005034E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 11:56:15,151] [INFO] [logging.py:60:log_dist] [Rank 0] step=3470, skipped=0, lr=[0.0005996987642092865, 0.0005996987642092865], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3470 loss: 2.9958 iter time (s): 62.435 samples/sec: 16.401 %comms: 0.0029096474373765647 %optimizer_step 0.05612841460415802 %forward: 23.37567584767134 %backward: 62.55680154384874 [2025-03-27 11:56:15,151] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16715.74 | forward: 145945.39 | backward_microstep: 390583.47 | backward: 390571.67 | backward_inner_microstep: 390555.03 | backward_inner: 390548.61 | backward_allreduce_microstep: 7.84 | backward_allreduce: 2.71 | reduce_tied_grads: 0.31 | comms: 18.17 | reduce_grads: 0.21 | step: 350.44 | _step_clipping: 0.14 | _step_step: 348.44 | _step_zero_grad: 0.51 | _step_check_overflow: 0.58 samples/sec: 16.401 | iteration 3470/ 143000 | elapsed time per iteration (ms): 62435.3 | learning rate: 5.997E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 3.002660E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 12:06:38,451] [INFO] [logging.py:60:log_dist] [Rank 0] step=3480, skipped=0, lr=[0.0005996958041791836, 0.0005996958041791836], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3480 loss: 3.0123 iter time (s): 62.329 samples/sec: 16.429 %comms: 0.0029025113810653763 %optimizer_step 0.05575683051583631 %forward: 23.33796479745454 %backward: 62.63980400492773 [2025-03-27 12:06:38,451] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16305.07 | forward: 145464.32 | backward_microstep: 390440.75 | backward: 390430.63 | backward_inner_microstep: 390412.33 | backward_inner: 390405.99 | backward_allreduce_microstep: 7.86 | backward_allreduce: 2.71 | reduce_tied_grads: 0.31 | comms: 18.09 | reduce_grads: 0.20 | step: 347.53 | _step_clipping: 0.13 | _step_step: 345.84 | _step_zero_grad: 0.46 | _step_check_overflow: 0.52 samples/sec: 16.429 | iteration 3480/ 143000 | elapsed time per iteration (ms): 62330.0 | learning rate: 5.997E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 3.010346E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 12:17:03,721] [INFO] [logging.py:60:log_dist] [Rank 0] step=3490, skipped=0, lr=[0.0005996928296844172, 0.0005996928296844172], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3490 loss: 3.0023 iter time (s): 62.526 samples/sec: 16.377 %comms: 0.002879832918397105 %optimizer_step 0.05721893410591441 %forward: 23.29347397613426 %backward: 62.446603231429386 [2025-03-27 12:17:03,721] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18071.02 | forward: 145645.75 | backward_microstep: 390466.90 | backward: 390456.23 | backward_inner_microstep: 390439.19 | backward_inner: 390432.66 | backward_allreduce_microstep: 8.17 | backward_allreduce: 2.82 | reduce_tied_grads: 0.30 | comms: 18.01 | reduce_grads: 0.21 | step: 357.77 | _step_clipping: 0.12 | _step_step: 356.05 | _step_zero_grad: 0.49 | _step_check_overflow: 0.51 samples/sec: 16.377 | iteration 3490/ 143000 | elapsed time per iteration (ms): 62527.0 | learning rate: 5.997E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 3.005369E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 12:27:29,439] [INFO] [logging.py:60:log_dist] [Rank 0] step=3500, skipped=0, lr=[0.0005996898407251309, 0.0005996898407251309], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3500 loss: 2.9875 iter time (s): 62.571 samples/sec: 16.365 %comms: 0.002893876540778829 %optimizer_step 0.05755307614660043 %forward: 23.283639068752443 %backward: 62.430503810359816 [2025-03-27 12:27:29,440] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18275.60 | forward: 145687.26 | backward_microstep: 390643.96 | backward: 390631.77 | backward_inner_microstep: 390614.74 | backward_inner: 390608.17 | backward_allreduce_microstep: 8.04 | backward_allreduce: 2.79 | reduce_tied_grads: 0.31 | comms: 18.11 | reduce_grads: 0.20 | step: 360.11 | _step_clipping: 0.11 | _step_step: 358.24 | _step_zero_grad: 0.49 | _step_check_overflow: 0.69 samples/sec: 16.365 | iteration 3500/ 143000 | elapsed time per iteration (ms): 62571.9 | learning rate: 5.997E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 3.002731E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 12:37:56,959] [INFO] [logging.py:60:log_dist] [Rank 0] step=3510, skipped=0, lr=[0.0005996868373014693, 0.0005996868373014693], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3510 loss: 3.0113 iter time (s): 62.751 samples/sec: 16.318 %comms: 0.0033141564000276334 %optimizer_step 0.05774055439086526 %forward: 23.238187810663387 %backward: 62.26286612644441 [2025-03-27 12:37:56,960] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19873.11 | forward: 145822.74 | backward_microstep: 390721.26 | backward: 390707.83 | backward_inner_microstep: 390690.40 | backward_inner: 390683.71 | backward_allreduce_microstep: 8.22 | backward_allreduce: 2.86 | reduce_tied_grads: 0.31 | comms: 20.80 | reduce_grads: 0.20 | step: 362.33 | _step_clipping: 0.12 | _step_step: 360.52 | _step_zero_grad: 0.53 | _step_check_overflow: 0.55 samples/sec: 16.318 | iteration 3510/ 143000 | elapsed time per iteration (ms): 62751.9 | learning rate: 5.997E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.993892E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 12:48:21,975] [INFO] [logging.py:60:log_dist] [Rank 0] step=3520, skipped=0, lr=[0.0005996838194135769, 0.0005996838194135769], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3520 loss: 3.0226 iter time (s): 62.501 samples/sec: 16.384 %comms: 0.002878331471509419 %optimizer_step 0.05805032491834344 %forward: 23.287730493464554 %backward: 62.47456658647933 [2025-03-27 12:48:21,975] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17965.38 | forward: 145550.76 | backward_microstep: 390485.14 | backward: 390472.60 | backward_inner_microstep: 390455.55 | backward_inner: 390449.07 | backward_allreduce_microstep: 8.06 | backward_allreduce: 2.78 | reduce_tied_grads: 0.32 | comms: 17.99 | reduce_grads: 0.21 | step: 362.82 | _step_clipping: 0.12 | _step_step: 361.11 | _step_zero_grad: 0.49 | _step_check_overflow: 0.51 samples/sec: 16.384 | iteration 3520/ 143000 | elapsed time per iteration (ms): 62501.6 | learning rate: 5.997E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 3.009101E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 12:58:44,608] [INFO] [logging.py:60:log_dist] [Rank 0] step=3530, skipped=0, lr=[0.0005996807870615995, 0.0005996807870615995], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3530 loss: 3.0048 iter time (s): 62.263 samples/sec: 16.446 %comms: 0.0029035157155615035 %optimizer_step 0.056200385093452106 %forward: 23.37143008765156 %backward: 62.721402660753 [2025-03-27 12:58:44,609] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15579.77 | forward: 145516.96 | backward_microstep: 390531.62 | backward: 390520.73 | backward_inner_microstep: 390504.60 | backward_inner: 390498.33 | backward_allreduce_microstep: 7.61 | backward_allreduce: 2.65 | reduce_tied_grads: 0.29 | comms: 18.08 | reduce_grads: 0.19 | step: 349.92 | _step_clipping: 0.11 | _step_step: 348.23 | _step_zero_grad: 0.48 | _step_check_overflow: 0.52 samples/sec: 16.446 | iteration 3530/ 143000 | elapsed time per iteration (ms): 62263.3 | learning rate: 5.997E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.996076E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 13:09:10,489] [INFO] [logging.py:60:log_dist] [Rank 0] step=3540, skipped=0, lr=[0.0005996777402456836, 0.0005996777402456836], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3540 loss: 3.0039 iter time (s): 62.588 samples/sec: 16.361 %comms: 0.0028702781504052395 %optimizer_step 0.057276044696916995 %forward: 23.27203109892207 %backward: 62.388179751776384 [2025-03-27 13:09:10,489] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18757.44 | forward: 145653.90 | backward_microstep: 390482.74 | backward: 390472.23 | backward_inner_microstep: 390455.32 | backward_inner: 390448.94 | backward_allreduce_microstep: 7.96 | backward_allreduce: 2.76 | reduce_tied_grads: 0.30 | comms: 17.96 | reduce_grads: 0.19 | step: 358.48 | _step_clipping: 0.10 | _step_step: 356.79 | _step_zero_grad: 0.50 | _step_check_overflow: 0.50 samples/sec: 16.361 | iteration 3540/ 143000 | elapsed time per iteration (ms): 62588.1 | learning rate: 5.997E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 3.002881E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 13:19:35,115] [INFO] [logging.py:60:log_dist] [Rank 0] step=3550, skipped=0, lr=[0.0005996746789659762, 0.0005996746789659762], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3550 loss: 3.0025 iter time (s): 62.462 samples/sec: 16.394 %comms: 0.002880244691676741 %optimizer_step 0.057291287716399256 %forward: 23.298861786599026 %backward: 62.5091224034509 [2025-03-27 13:19:35,116] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17720.71 | forward: 145529.39 | backward_microstep: 390454.65 | backward: 390444.58 | backward_inner_microstep: 390427.89 | backward_inner: 390421.54 | backward_allreduce_microstep: 8.01 | backward_allreduce: 2.74 | reduce_tied_grads: 0.31 | comms: 17.99 | reduce_grads: 0.21 | step: 357.85 | _step_clipping: 0.11 | _step_step: 356.14 | _step_zero_grad: 0.49 | _step_check_overflow: 0.53 samples/sec: 16.394 | iteration 3550/ 143000 | elapsed time per iteration (ms): 62462.6 | learning rate: 5.997E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 3.004356E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 13:29:59,971] [INFO] [logging.py:60:log_dist] [Rank 0] step=3560, skipped=0, lr=[0.0005996716032226249, 0.0005996716032226249], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3560 loss: 2.9786 iter time (s): 62.485 samples/sec: 16.388 %comms: 0.0028899445585961734 %optimizer_step 0.056881122008101 %forward: 23.30795593804324 %backward: 62.493896895156475 [2025-03-27 13:29:59,971] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17817.80 | forward: 145639.80 | backward_microstep: 390504.03 | backward: 390493.21 | backward_inner_microstep: 390476.46 | backward_inner: 390470.10 | backward_allreduce_microstep: 8.04 | backward_allreduce: 2.79 | reduce_tied_grads: 0.31 | comms: 18.06 | reduce_grads: 0.22 | step: 355.42 | _step_clipping: 0.11 | _step_step: 353.71 | _step_zero_grad: 0.49 | _step_check_overflow: 0.52 samples/sec: 16.388 | iteration 3560/ 143000 | elapsed time per iteration (ms): 62485.6 | learning rate: 5.997E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.988933E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 13:40:26,351] [INFO] [logging.py:60:log_dist] [Rank 0] step=3570, skipped=0, lr=[0.0005996685130157784, 0.0005996685130157784], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3570 loss: 2.9727 iter time (s): 62.637 samples/sec: 16.348 %comms: 0.002886529062280673 %optimizer_step 0.05811783708631555 %forward: 23.291213220013976 %backward: 62.37053997088565 [2025-03-27 13:40:26,351] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18900.02 | forward: 145890.15 | backward_microstep: 390685.86 | backward: 390672.96 | backward_inner_microstep: 390656.23 | backward_inner: 390649.71 | backward_allreduce_microstep: 7.85 | backward_allreduce: 2.71 | reduce_tied_grads: 0.29 | comms: 18.08 | reduce_grads: 0.20 | step: 364.04 | _step_clipping: 0.11 | _step_step: 362.22 | _step_zero_grad: 0.54 | _step_check_overflow: 0.55 samples/sec: 16.348 | iteration 3570/ 143000 | elapsed time per iteration (ms): 62638.0 | learning rate: 5.997E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.987257E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 13:50:56,185] [INFO] [logging.py:60:log_dist] [Rank 0] step=3580, skipped=0, lr=[0.0005996654083455856, 0.0005996654083455856], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3580 loss: 3.0079 iter time (s): 62.983 samples/sec: 16.258 %comms: 0.00290291209916524 %optimizer_step 0.056380794419860235 %forward: 23.12069490472045 %backward: 62.02157088318431 [2025-03-27 13:50:56,185] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22699.23 | forward: 145620.72 | backward_microstep: 390641.05 | backward: 390629.52 | backward_inner_microstep: 390612.67 | backward_inner: 390606.03 | backward_allreduce_microstep: 7.97 | backward_allreduce: 2.78 | reduce_tied_grads: 0.32 | comms: 18.28 | reduce_grads: 0.20 | step: 355.10 | _step_clipping: 0.13 | _step_step: 353.29 | _step_zero_grad: 0.50 | _step_check_overflow: 0.59 samples/sec: 16.258 | iteration 3580/ 143000 | elapsed time per iteration (ms): 62983.4 | learning rate: 5.997E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.999067E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 14:01:28,688] [INFO] [logging.py:60:log_dist] [Rank 0] step=3590, skipped=0, lr=[0.0005996622892121966, 0.0005996622892121966], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3590 loss: 2.9990 iter time (s): 63.250 samples/sec: 16.190 %comms: 0.002867258672609916 %optimizer_step 0.05589531648349267 %forward: 23.02796880580581 %backward: 61.74649022584483 [2025-03-27 14:01:28,688] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25431.09 | forward: 145651.08 | backward_microstep: 390555.14 | backward: 390544.36 | backward_inner_microstep: 390527.90 | backward_inner: 390521.49 | backward_allreduce_microstep: 7.81 | backward_allreduce: 2.69 | reduce_tied_grads: 0.33 | comms: 18.14 | reduce_grads: 0.20 | step: 353.54 | _step_clipping: 0.17 | _step_step: 351.70 | _step_zero_grad: 0.48 | _step_check_overflow: 0.58 samples/sec: 16.190 | iteration 3590/ 143000 | elapsed time per iteration (ms): 63250.3 | learning rate: 5.997E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 3.002080E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 14:11:53,600] [INFO] [logging.py:60:log_dist] [Rank 0] step=3600, skipped=0, lr=[0.0005996591556157616, 0.0005996591556157616], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3600 loss: 2.9749 iter time (s): 62.491 samples/sec: 16.386 %comms: 0.00290353254750831 %optimizer_step 0.05841333822059863 %forward: 23.2816915901951 %backward: 62.47297911635694 [2025-03-27 14:11:53,601] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18199.88 | forward: 145488.85 | backward_microstep: 390406.29 | backward: 390397.83 | backward_inner_microstep: 390381.73 | backward_inner: 390375.46 | backward_allreduce_microstep: 7.70 | backward_allreduce: 2.65 | reduce_tied_grads: 0.29 | comms: 18.14 | reduce_grads: 0.20 | step: 365.03 | _step_clipping: 0.13 | _step_step: 363.13 | _step_zero_grad: 0.53 | _step_check_overflow: 0.62 samples/sec: 16.386 | iteration 3600/ 143000 | elapsed time per iteration (ms): 62491.2 | learning rate: 5.997E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.990639E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 14:22:17,691] [INFO] [logging.py:60:log_dist] [Rank 0] step=3610, skipped=0, lr=[0.0005996560075564321, 0.0005996560075564321], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3610 loss: 3.0023 iter time (s): 62.409 samples/sec: 16.408 %comms: 0.0028929108754260006 %optimizer_step 0.05840065978676595 %forward: 23.304837194041546 %backward: 62.54737011668817 [2025-03-27 14:22:17,692] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17482.45 | forward: 145442.19 | backward_microstep: 390358.91 | backward: 390349.28 | backward_inner_microstep: 390333.42 | backward_inner: 390327.35 | backward_allreduce_microstep: 7.58 | backward_allreduce: 2.60 | reduce_tied_grads: 0.31 | comms: 18.05 | reduce_grads: 0.20 | step: 364.47 | _step_clipping: 0.13 | _step_step: 362.76 | _step_zero_grad: 0.47 | _step_check_overflow: 0.54 samples/sec: 16.408 | iteration 3610/ 143000 | elapsed time per iteration (ms): 62409.1 | learning rate: 5.997E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.981131E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 14:32:43,787] [INFO] [logging.py:60:log_dist] [Rank 0] step=3620, skipped=0, lr=[0.00059965284503436, 0.00059965284503436], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3620 loss: 2.9648 iter time (s): 62.609 samples/sec: 16.355 %comms: 0.0028845232187725447 %optimizer_step 0.05629397555094297 %forward: 23.247279507668537 %backward: 62.36209466271164 [2025-03-27 14:32:43,788] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19273.61 | forward: 145549.05 | backward_microstep: 390452.51 | backward: 390443.25 | backward_inner_microstep: 390426.65 | backward_inner: 390420.30 | backward_allreduce_microstep: 7.97 | backward_allreduce: 2.75 | reduce_tied_grads: 0.33 | comms: 18.06 | reduce_grads: 0.19 | step: 352.45 | _step_clipping: 0.12 | _step_step: 350.78 | _step_zero_grad: 0.46 | _step_check_overflow: 0.52 samples/sec: 16.355 | iteration 3620/ 143000 | elapsed time per iteration (ms): 62609.6 | learning rate: 5.997E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.980405E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 14:43:11,085] [INFO] [logging.py:60:log_dist] [Rank 0] step=3630, skipped=0, lr=[0.000599649668049698, 0.000599649668049698], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3630 loss: 2.9796 iter time (s): 62.729 samples/sec: 16.324 %comms: 0.0028850834281537634 %optimizer_step 0.05777612551670009 %forward: 23.229351082237258 %backward: 62.25114194238296 [2025-03-27 14:43:11,085] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20289.37 | forward: 145715.70 | backward_microstep: 390506.72 | backward: 390496.00 | backward_inner_microstep: 390479.25 | backward_inner: 390472.85 | backward_allreduce_microstep: 7.82 | backward_allreduce: 2.70 | reduce_tied_grads: 0.31 | comms: 18.10 | reduce_grads: 0.19 | step: 362.42 | _step_clipping: 0.11 | _step_step: 360.63 | _step_zero_grad: 0.50 | _step_check_overflow: 0.57 samples/sec: 16.324 | iteration 3630/ 143000 | elapsed time per iteration (ms): 62729.7 | learning rate: 5.996E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.976443E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 14:53:38,236] [INFO] [logging.py:60:log_dist] [Rank 0] step=3640, skipped=0, lr=[0.0005996464766025992, 0.0005996464766025992], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3640 loss: 3.0113 iter time (s): 62.715 samples/sec: 16.328 %comms: 0.0028774622395247402 %optimizer_step 0.057728758288909866 %forward: 23.234742351801074 %backward: 62.26346105167381 [2025-03-27 14:53:38,237] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20163.37 | forward: 145715.86 | backward_microstep: 390493.55 | backward: 390483.08 | backward_inner_microstep: 390466.54 | backward_inner: 390460.02 | backward_allreduce_microstep: 7.86 | backward_allreduce: 2.70 | reduce_tied_grads: 0.30 | comms: 18.05 | reduce_grads: 0.21 | step: 362.04 | _step_clipping: 0.11 | _step_step: 360.33 | _step_zero_grad: 0.48 | _step_check_overflow: 0.53 samples/sec: 16.328 | iteration 3640/ 143000 | elapsed time per iteration (ms): 62715.2 | learning rate: 5.996E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.992182E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 15:04:04,105] [INFO] [logging.py:60:log_dist] [Rank 0] step=3650, skipped=0, lr=[0.0005996432706932178, 0.0005996432706932178], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3650 loss: 2.9882 iter time (s): 62.586 samples/sec: 16.361 %comms: 0.002898295304696084 %optimizer_step 0.05765334184777895 %forward: 23.264740141285593 %backward: 62.39368518587032 [2025-03-27 15:04:04,105] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19010.96 | forward: 145605.44 | backward_microstep: 390507.96 | backward: 390499.08 | backward_inner_microstep: 390482.81 | backward_inner: 390476.28 | backward_allreduce_microstep: 7.81 | backward_allreduce: 2.72 | reduce_tied_grads: 0.30 | comms: 18.14 | reduce_grads: 0.19 | step: 360.83 | _step_clipping: 0.11 | _step_step: 358.94 | _step_zero_grad: 0.47 | _step_check_overflow: 0.55 samples/sec: 16.361 | iteration 3650/ 143000 | elapsed time per iteration (ms): 62586.9 | learning rate: 5.996E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.990667E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 15:14:28,966] [INFO] [logging.py:60:log_dist] [Rank 0] step=3660, skipped=0, lr=[0.0005996400503217086, 0.0005996400503217086], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3660 loss: 2.9909 iter time (s): 62.486 samples/sec: 16.388 %comms: 0.0029040000768244217 %optimizer_step 0.05675267558722499 %forward: 23.256847597764615 %backward: 62.471695032637534 [2025-03-27 15:14:28,966] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18459.42 | forward: 145321.65 | backward_microstep: 390366.00 | backward: 390357.72 | backward_inner_microstep: 390342.22 | backward_inner: 390336.16 | backward_allreduce_microstep: 7.39 | backward_allreduce: 2.53 | reduce_tied_grads: 0.30 | comms: 18.15 | reduce_grads: 0.20 | step: 354.62 | _step_clipping: 0.12 | _step_step: 352.95 | _step_zero_grad: 0.46 | _step_check_overflow: 0.52 samples/sec: 16.388 | iteration 3660/ 143000 | elapsed time per iteration (ms): 62486.1 | learning rate: 5.996E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.977642E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 15:24:55,446] [INFO] [logging.py:60:log_dist] [Rank 0] step=3670, skipped=0, lr=[0.0005996368154882269, 0.0005996368154882269], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3670 loss: 2.9561 iter time (s): 62.647 samples/sec: 16.345 %comms: 0.0028657425574435446 %optimizer_step 0.06018127873527865 %forward: 23.19579833405074 %backward: 62.30732146082209 [2025-03-27 15:24:55,447] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20092.32 | forward: 145315.85 | backward_microstep: 390347.26 | backward: 390339.72 | backward_inner_microstep: 390324.25 | backward_inner: 390318.21 | backward_allreduce_microstep: 7.43 | backward_allreduce: 2.54 | reduce_tied_grads: 0.30 | comms: 17.95 | reduce_grads: 0.18 | step: 377.02 | _step_clipping: 0.15 | _step_step: 375.25 | _step_zero_grad: 0.48 | _step_check_overflow: 0.59 samples/sec: 16.345 | iteration 3670/ 143000 | elapsed time per iteration (ms): 62648.0 | learning rate: 5.996E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.976938E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 15:35:24,587] [INFO] [logging.py:60:log_dist] [Rank 0] step=3680, skipped=0, lr=[0.0005996335661929289, 0.0005996335661929289], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3680 loss: 2.9663 iter time (s): 62.914 samples/sec: 16.276 %comms: 0.002842404939215363 %optimizer_step 0.05497826207867663 %forward: 23.0881494665114 %backward: 62.02011607397986 [2025-03-27 15:35:24,588] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23053.66 | forward: 145255.80 | backward_microstep: 390198.33 | backward: 390190.72 | backward_inner_microstep: 390175.40 | backward_inner: 390169.63 | backward_allreduce_microstep: 7.37 | backward_allreduce: 2.53 | reduce_tied_grads: 0.27 | comms: 17.88 | reduce_grads: 0.18 | step: 345.89 | _step_clipping: 0.14 | _step_step: 344.26 | _step_zero_grad: 0.47 | _step_check_overflow: 0.48 samples/sec: 16.276 | iteration 3680/ 143000 | elapsed time per iteration (ms): 62914.1 | learning rate: 5.996E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.967241E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 15:45:52,272] [INFO] [logging.py:60:log_dist] [Rank 0] step=3690, skipped=0, lr=[0.0005996303024359713, 0.0005996303024359713], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3690 loss: 2.9655 iter time (s): 62.768 samples/sec: 16.314 %comms: 0.002895681461407504 %optimizer_step 0.05602031072926694 %forward: 23.130576380870984 %backward: 62.15229401386233 [2025-03-27 15:45:52,273] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21783.61 | forward: 145185.91 | backward_microstep: 390123.62 | backward: 390117.27 | backward_inner_microstep: 390102.22 | backward_inner: 390096.55 | backward_allreduce_microstep: 7.23 | backward_allreduce: 2.48 | reduce_tied_grads: 0.31 | comms: 18.18 | reduce_grads: 0.18 | step: 351.63 | _step_clipping: 0.12 | _step_step: 349.85 | _step_zero_grad: 0.50 | _step_check_overflow: 0.57 samples/sec: 16.314 | iteration 3690/ 143000 | elapsed time per iteration (ms): 62768.5 | learning rate: 5.996E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.963347E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 15:56:19,718] [INFO] [logging.py:60:log_dist] [Rank 0] step=3700, skipped=0, lr=[0.0005996270242175117, 0.0005996270242175117], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3700 loss: 2.9511 iter time (s): 62.744 samples/sec: 16.320 %comms: 0.0028995605858369985 %optimizer_step 0.056341083707506776 %forward: 23.151031993138663 %backward: 62.19114001342276 [2025-03-27 15:56:19,719] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21358.91 | forward: 145258.86 | backward_microstep: 390218.98 | backward: 390212.17 | backward_inner_microstep: 390196.62 | backward_inner: 390190.74 | backward_allreduce_microstep: 7.40 | backward_allreduce: 2.55 | reduce_tied_grads: 0.33 | comms: 18.19 | reduce_grads: 0.19 | step: 353.51 | _step_clipping: 0.12 | _step_step: 351.78 | _step_zero_grad: 0.47 | _step_check_overflow: 0.58 samples/sec: 16.320 | iteration 3700/ 143000 | elapsed time per iteration (ms): 62744.6 | learning rate: 5.996E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.963367E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 16:06:44,755] [INFO] [logging.py:60:log_dist] [Rank 0] step=3710, skipped=0, lr=[0.0005996237315377085, 0.0005996237315377085], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3710 loss: 2.9583 iter time (s): 62.503 samples/sec: 16.383 %comms: 0.0028698818921575322 %optimizer_step 0.05690346856364482 %forward: 23.258712865546975 %backward: 62.44687018905298 [2025-03-27 16:06:44,755] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18723.48 | forward: 145374.24 | backward_microstep: 390321.06 | backward: 390312.49 | backward_inner_microstep: 390295.05 | backward_inner: 390288.86 | backward_allreduce_microstep: 9.35 | backward_allreduce: 2.53 | reduce_tied_grads: 0.28 | comms: 17.94 | reduce_grads: 0.19 | step: 355.66 | _step_clipping: 0.13 | _step_step: 353.94 | _step_zero_grad: 0.48 | _step_check_overflow: 0.56 samples/sec: 16.383 | iteration 3710/ 143000 | elapsed time per iteration (ms): 62503.7 | learning rate: 5.996E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.968405E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 16:17:10,382] [INFO] [logging.py:60:log_dist] [Rank 0] step=3720, skipped=0, lr=[0.0005996204243967204, 0.0005996204243967204], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3720 loss: 2.9640 iter time (s): 62.562 samples/sec: 16.368 %comms: 0.0028795954208753038 %optimizer_step 0.056336444142535635 %forward: 23.245467534737845 %backward: 62.38291995822385 [2025-03-27 16:17:10,383] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19318.01 | forward: 145428.78 | backward_microstep: 390288.35 | backward: 390281.32 | backward_inner_microstep: 390265.71 | backward_inner: 390259.69 | backward_allreduce_microstep: 7.55 | backward_allreduce: 2.62 | reduce_tied_grads: 0.35 | comms: 18.02 | reduce_grads: 0.19 | step: 352.45 | _step_clipping: 0.14 | _step_step: 350.70 | _step_zero_grad: 0.47 | _step_check_overflow: 0.58 samples/sec: 16.368 | iteration 3720/ 143000 | elapsed time per iteration (ms): 62562.8 | learning rate: 5.996E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.977786E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 16:27:40,620] [INFO] [logging.py:60:log_dist] [Rank 0] step=3730, skipped=0, lr=[0.0005996171027947071, 0.0005996171027947071], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3730 loss: 2.9679 iter time (s): 63.023 samples/sec: 16.248 %comms: 0.002846048094556879 %optimizer_step 0.05580027749150507 %forward: 23.07179923819367 %backward: 61.943060029146636 [2025-03-27 16:27:40,620] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23848.33 | forward: 145405.87 | backward_microstep: 390393.99 | backward: 390385.00 | backward_inner_microstep: 390369.35 | backward_inner: 390361.58 | backward_allreduce_microstep: 7.43 | backward_allreduce: 2.56 | reduce_tied_grads: 0.30 | comms: 17.94 | reduce_grads: 0.19 | step: 351.67 | _step_clipping: 0.13 | _step_step: 349.93 | _step_zero_grad: 0.47 | _step_check_overflow: 0.59 samples/sec: 16.248 | iteration 3730/ 143000 | elapsed time per iteration (ms): 63023.8 | learning rate: 5.996E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.963393E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 16:38:08,872] [INFO] [logging.py:60:log_dist] [Rank 0] step=3740, skipped=0, lr=[0.0005996137667318289, 0.0005996137667318289], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3740 loss: 2.9811 iter time (s): 62.825 samples/sec: 16.299 %comms: 0.0028843780860495004 %optimizer_step 0.055945436065859576 %forward: 23.16642666608144 %backward: 62.154117788572016 [2025-03-27 16:38:08,872] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21617.90 | forward: 145542.26 | backward_microstep: 390491.40 | backward: 390481.06 | backward_inner_microstep: 390465.09 | backward_inner: 390458.74 | backward_allreduce_microstep: 7.54 | backward_allreduce: 2.59 | reduce_tied_grads: 0.29 | comms: 18.12 | reduce_grads: 0.18 | step: 351.48 | _step_clipping: 0.12 | _step_step: 349.77 | _step_zero_grad: 0.48 | _step_check_overflow: 0.54 samples/sec: 16.299 | iteration 3740/ 143000 | elapsed time per iteration (ms): 62825.2 | learning rate: 5.996E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.969294E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 16:48:33,316] [INFO] [logging.py:60:log_dist] [Rank 0] step=3750, skipped=0, lr=[0.0005996104162082467, 0.0005996104162082467], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3750 loss: 2.9695 iter time (s): 62.444 samples/sec: 16.399 %comms: 0.0029156735500449915 %optimizer_step 0.057623841127387276 %forward: 23.26786360141458 %backward: 62.496372241224364 [2025-03-27 16:48:33,317] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18362.58 | forward: 145293.56 | backward_microstep: 390259.30 | backward: 390251.59 | backward_inner_microstep: 390235.75 | backward_inner: 390229.74 | backward_allreduce_microstep: 7.71 | backward_allreduce: 2.79 | reduce_tied_grads: 0.33 | comms: 18.21 | reduce_grads: 0.20 | step: 359.83 | _step_clipping: 0.14 | _step_step: 358.03 | _step_zero_grad: 0.49 | _step_check_overflow: 0.59 samples/sec: 16.399 | iteration 3750/ 143000 | elapsed time per iteration (ms): 62444.5 | learning rate: 5.996E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.961715E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 16:58:57,621] [INFO] [logging.py:60:log_dist] [Rank 0] step=3760, skipped=0, lr=[0.0005996070512241224, 0.0005996070512241224], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3760 loss: 2.9666 iter time (s): 62.430 samples/sec: 16.402 %comms: 0.002892917508342107 %optimizer_step 0.05548336307853589 %forward: 23.277187959805264 %backward: 62.512352165054516 [2025-03-27 16:58:57,621] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18196.60 | forward: 145319.18 | backward_microstep: 390271.19 | backward: 390263.79 | backward_inner_microstep: 390247.93 | backward_inner: 390241.71 | backward_allreduce_microstep: 7.53 | backward_allreduce: 2.60 | reduce_tied_grads: 0.28 | comms: 18.06 | reduce_grads: 0.19 | step: 346.38 | _step_clipping: 0.11 | _step_step: 344.70 | _step_zero_grad: 0.48 | _step_check_overflow: 0.54 samples/sec: 16.402 | iteration 3760/ 143000 | elapsed time per iteration (ms): 62430.4 | learning rate: 5.996E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.959595E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 17:09:24,439] [INFO] [logging.py:60:log_dist] [Rank 0] step=3770, skipped=0, lr=[0.0005996036717796184, 0.0005996036717796184], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3770 loss: 2.9564 iter time (s): 62.681 samples/sec: 16.337 %comms: 0.0029146732978899883 %optimizer_step 0.05774714538858756 %forward: 23.220957360336087 %backward: 62.278806919055405 [2025-03-27 17:09:24,440] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20346.04 | forward: 145551.88 | backward_microstep: 390379.84 | backward: 390371.40 | backward_inner_microstep: 390355.56 | backward_inner: 390349.45 | backward_allreduce_microstep: 7.56 | backward_allreduce: 2.59 | reduce_tied_grads: 0.31 | comms: 18.27 | reduce_grads: 0.19 | step: 361.97 | _step_clipping: 0.14 | _step_step: 360.19 | _step_zero_grad: 0.49 | _step_check_overflow: 0.57 samples/sec: 16.336 | iteration 3770/ 143000 | elapsed time per iteration (ms): 62681.8 | learning rate: 5.996E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.960100E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 17:19:54,574] [INFO] [logging.py:60:log_dist] [Rank 0] step=3780, skipped=0, lr=[0.0005996002778748976, 0.0005996002778748976], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3780 loss: 2.9710 iter time (s): 63.013 samples/sec: 16.251 %comms: 0.002853398507385856 %optimizer_step 0.05629754422670258 %forward: 23.07443409005678 %backward: 61.957481812062795 [2025-03-27 17:19:54,574] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23799.69 | forward: 145398.76 | backward_microstep: 390419.64 | backward: 390412.22 | backward_inner_microstep: 390396.74 | backward_inner: 390390.75 | backward_allreduce_microstep: 7.41 | backward_allreduce: 2.55 | reduce_tied_grads: 0.32 | comms: 17.98 | reduce_grads: 0.19 | step: 354.75 | _step_clipping: 0.13 | _step_step: 352.96 | _step_zero_grad: 0.50 | _step_check_overflow: 0.60 samples/sec: 16.250 | iteration 3780/ 143000 | elapsed time per iteration (ms): 63013.5 | learning rate: 5.996E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.955480E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 17:30:21,388] [INFO] [logging.py:60:log_dist] [Rank 0] step=3790, skipped=0, lr=[0.000599596869510124, 0.000599596869510124], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3790 loss: 2.9469 iter time (s): 62.681 samples/sec: 16.337 %comms: 0.0028819041104014157 %optimizer_step 0.05663158767663782 %forward: 23.243968504210635 %backward: 62.30785301068813 [2025-03-27 17:30:21,389] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20036.55 | forward: 145695.19 | backward_microstep: 390559.52 | backward: 390550.97 | backward_inner_microstep: 390534.79 | backward_inner: 390526.85 | backward_allreduce_microstep: 7.91 | backward_allreduce: 2.78 | reduce_tied_grads: 0.32 | comms: 18.06 | reduce_grads: 0.20 | step: 354.97 | _step_clipping: 0.14 | _step_step: 353.12 | _step_zero_grad: 0.50 | _step_check_overflow: 0.62 samples/sec: 16.337 | iteration 3790/ 143000 | elapsed time per iteration (ms): 62681.4 | learning rate: 5.996E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.951198E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 17:40:45,862] [INFO] [logging.py:60:log_dist] [Rank 0] step=3800, skipped=0, lr=[0.0005995934466854621, 0.0005995934466854621], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3800 loss: 2.9696 iter time (s): 62.447 samples/sec: 16.398 %comms: 0.0028729306477893782 %optimizer_step 0.05564073447577689 %forward: 23.296022655907876 %backward: 62.52369429965683 [2025-03-27 17:40:45,862] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18081.29 | forward: 145476.11 | backward_microstep: 390448.24 | backward: 390440.22 | backward_inner_microstep: 390424.77 | backward_inner: 390418.86 | backward_allreduce_microstep: 7.36 | backward_allreduce: 2.55 | reduce_tied_grads: 0.31 | comms: 17.94 | reduce_grads: 0.19 | step: 347.46 | _step_clipping: 0.13 | _step_step: 345.73 | _step_zero_grad: 0.48 | _step_check_overflow: 0.56 samples/sec: 16.398 | iteration 3800/ 143000 | elapsed time per iteration (ms): 62447.3 | learning rate: 5.996E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.954548E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 17:51:11,735] [INFO] [logging.py:60:log_dist] [Rank 0] step=3810, skipped=0, lr=[0.000599590009401077, 0.000599590009401077], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3810 loss: 2.9497 iter time (s): 62.587 samples/sec: 16.361 %comms: 0.0028815495202479087 %optimizer_step 0.055979803065017326 %forward: 23.277964923538395 %backward: 62.37305564971911 [2025-03-27 17:51:11,736] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19359.44 | forward: 145689.34 | backward_microstep: 390380.97 | backward: 390373.00 | backward_inner_microstep: 390356.80 | backward_inner: 390350.68 | backward_allreduce_microstep: 7.87 | backward_allreduce: 2.67 | reduce_tied_grads: 0.29 | comms: 18.03 | reduce_grads: 0.19 | step: 350.36 | _step_clipping: 0.12 | _step_step: 348.58 | _step_zero_grad: 0.49 | _step_check_overflow: 0.59 samples/sec: 16.361 | iteration 3810/ 143000 | elapsed time per iteration (ms): 62587.4 | learning rate: 5.996E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.953983E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 18:01:32,322] [INFO] [logging.py:60:log_dist] [Rank 0] step=3820, skipped=0, lr=[0.0005995865576571346, 0.0005995865576571346], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3820 loss: 2.9561 iter time (s): 62.058 samples/sec: 16.501 %comms: 0.002932029975997525 %optimizer_step 0.0560209895204305 %forward: 23.42829377961454 %backward: 62.920183872602664 [2025-03-27 18:01:32,323] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14278.05 | forward: 145391.61 | backward_microstep: 390481.29 | backward: 390470.88 | backward_inner_microstep: 390453.42 | backward_inner: 390447.20 | backward_allreduce_microstep: 9.24 | backward_allreduce: 2.57 | reduce_tied_grads: 0.31 | comms: 18.20 | reduce_grads: 0.19 | step: 347.66 | _step_clipping: 0.13 | _step_step: 345.92 | _step_zero_grad: 0.46 | _step_check_overflow: 0.58 samples/sec: 16.501 | iteration 3820/ 143000 | elapsed time per iteration (ms): 62058.7 | learning rate: 5.996E-04 | approx flops per GPU: 71.2TFLOPS | lm_loss: 2.950819E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 18:11:52,208] [INFO] [logging.py:60:log_dist] [Rank 0] step=3830, skipped=0, lr=[0.0005995830914538016, 0.0005995830914538016], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3830 loss: 2.9487 iter time (s): 61.988 samples/sec: 16.519 %comms: 0.0028985785823253925 %optimizer_step 0.05561450055947928 %forward: 23.44410877176403 %backward: 62.965803812826 [2025-03-27 18:11:52,208] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13813.49 | forward: 145325.28 | backward_microstep: 390320.98 | backward: 390312.25 | backward_inner_microstep: 390296.83 | backward_inner: 390290.82 | backward_allreduce_microstep: 7.32 | backward_allreduce: 2.53 | reduce_tied_grads: 0.30 | comms: 17.97 | reduce_grads: 0.19 | step: 344.74 | _step_clipping: 0.12 | _step_step: 342.88 | _step_zero_grad: 0.45 | _step_check_overflow: 0.52 samples/sec: 16.519 | iteration 3830/ 143000 | elapsed time per iteration (ms): 61988.5 | learning rate: 5.996E-04 | approx flops per GPU: 71.3TFLOPS | lm_loss: 2.948750E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 18:22:16,930] [INFO] [logging.py:60:log_dist] [Rank 0] step=3840, skipped=0, lr=[0.0005995796107912452, 0.0005995796107912452], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3840 loss: 2.9490 iter time (s): 62.472 samples/sec: 16.391 %comms: 0.002880184195496126 %optimizer_step 0.05677979277179284 %forward: 23.25430794807598 %backward: 62.47696156049872 [2025-03-27 18:22:16,930] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18765.33 | forward: 145273.40 | backward_microstep: 390311.24 | backward: 390303.61 | backward_inner_microstep: 390288.20 | backward_inner: 390282.26 | backward_allreduce_microstep: 7.35 | backward_allreduce: 2.51 | reduce_tied_grads: 0.34 | comms: 17.99 | reduce_grads: 0.19 | step: 354.71 | _step_clipping: 0.14 | _step_step: 352.96 | _step_zero_grad: 0.48 | _step_check_overflow: 0.58 samples/sec: 16.391 | iteration 3840/ 143000 | elapsed time per iteration (ms): 62472.2 | learning rate: 5.996E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.953984E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 18:32:41,654] [INFO] [logging.py:60:log_dist] [Rank 0] step=3850, skipped=0, lr=[0.0005995761156696335, 0.0005995761156696335], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3850 loss: 2.9336 iter time (s): 62.472 samples/sec: 16.391 %comms: 0.0028685348420999456 %optimizer_step 0.05601861599921833 %forward: 23.243364007317076 %backward: 62.46584053791876 [2025-03-27 18:32:41,654] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18924.19 | forward: 145205.50 | backward_microstep: 390241.87 | backward: 390235.40 | backward_inner_microstep: 390220.22 | backward_inner: 390214.47 | backward_allreduce_microstep: 7.26 | backward_allreduce: 2.47 | reduce_tied_grads: 0.29 | comms: 17.92 | reduce_grads: 0.18 | step: 349.96 | _step_clipping: 0.12 | _step_step: 348.25 | _step_zero_grad: 0.46 | _step_check_overflow: 0.57 samples/sec: 16.391 | iteration 3850/ 143000 | elapsed time per iteration (ms): 62472.4 | learning rate: 5.996E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.953112E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 18:43:07,484] [INFO] [logging.py:60:log_dist] [Rank 0] step=3860, skipped=0, lr=[0.000599572606089135, 0.000599572606089135], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3860 loss: 2.9462 iter time (s): 62.582 samples/sec: 16.362 %comms: 0.002898475905959966 %optimizer_step 0.05600410748345796 %forward: 23.249479349522073 %backward: 62.37393847515348 [2025-03-27 18:43:07,484] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19581.15 | forward: 145500.86 | backward_microstep: 390358.73 | backward: 390351.17 | backward_inner_microstep: 390335.68 | backward_inner: 390329.74 | backward_allreduce_microstep: 7.48 | backward_allreduce: 2.58 | reduce_tied_grads: 0.29 | comms: 18.14 | reduce_grads: 0.18 | step: 350.49 | _step_clipping: 0.16 | _step_step: 348.63 | _step_zero_grad: 0.51 | _step_check_overflow: 0.62 samples/sec: 16.362 | iteration 3860/ 143000 | elapsed time per iteration (ms): 62583.0 | learning rate: 5.996E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.941455E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 18:53:26,962] [INFO] [logging.py:60:log_dist] [Rank 0] step=3870, skipped=0, lr=[0.0005995690820499192, 0.0005995690820499192], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3870 loss: 2.9781 iter time (s): 61.947 samples/sec: 16.530 %comms: 0.0029017923256910005 %optimizer_step 0.05687949404916159 %forward: 23.45595846264022 %backward: 63.016746215184874 [2025-03-27 18:53:26,963] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13401.62 | forward: 145303.23 | backward_microstep: 390378.51 | backward: 390371.45 | backward_inner_microstep: 390356.36 | backward_inner: 390350.57 | backward_allreduce_microstep: 7.22 | backward_allreduce: 2.49 | reduce_tied_grads: 0.30 | comms: 17.98 | reduce_grads: 0.18 | step: 352.35 | _step_clipping: 0.11 | _step_step: 350.66 | _step_zero_grad: 0.47 | _step_check_overflow: 0.55 samples/sec: 16.530 | iteration 3870/ 143000 | elapsed time per iteration (ms): 61947.8 | learning rate: 5.996E-04 | approx flops per GPU: 71.3TFLOPS | lm_loss: 2.961830E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 19:03:52,630] [INFO] [logging.py:60:log_dist] [Rank 0] step=3880, skipped=0, lr=[0.0005995655435521562, 0.0005995655435521562], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3880 loss: 2.9462 iter time (s): 62.566 samples/sec: 16.367 %comms: 0.002884401182412656 %optimizer_step 0.05626218983214947 %forward: 23.232368949823716 %backward: 62.380785124650494 [2025-03-27 19:03:52,631] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19616.83 | forward: 145356.23 | backward_microstep: 390302.02 | backward: 390293.20 | backward_inner_microstep: 390277.53 | backward_inner: 390271.49 | backward_allreduce_microstep: 7.49 | backward_allreduce: 2.57 | reduce_tied_grads: 0.29 | comms: 18.05 | reduce_grads: 0.19 | step: 352.01 | _step_clipping: 0.13 | _step_step: 350.27 | _step_zero_grad: 0.48 | _step_check_overflow: 0.57 samples/sec: 16.366 | iteration 3880/ 143000 | elapsed time per iteration (ms): 62566.8 | learning rate: 5.996E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.964959E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 19:14:23,967] [INFO] [logging.py:60:log_dist] [Rank 0] step=3890, skipped=0, lr=[0.0005995619905960169, 0.0005995619905960169], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3890 loss: 2.9375 iter time (s): 63.133 samples/sec: 16.220 %comms: 0.0028671154568988095 %optimizer_step 0.055751443557473836 %forward: 23.03518677631442 %backward: 61.808874150097125 [2025-03-27 19:14:23,967] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25364.00 | forward: 145428.18 | backward_microstep: 390225.47 | backward: 390218.33 | backward_inner_microstep: 390202.83 | backward_inner: 390196.98 | backward_allreduce_microstep: 7.46 | backward_allreduce: 2.57 | reduce_tied_grads: 0.28 | comms: 18.10 | reduce_grads: 0.18 | step: 351.98 | _step_clipping: 0.12 | _step_step: 350.36 | _step_zero_grad: 0.48 | _step_check_overflow: 0.47 samples/sec: 16.220 | iteration 3890/ 143000 | elapsed time per iteration (ms): 63133.6 | learning rate: 5.996E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.944128E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 19:24:48,221] [INFO] [logging.py:60:log_dist] [Rank 0] step=3900, skipped=0, lr=[0.0005995584231816726, 0.0005995584231816726], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3900 loss: 2.9543 iter time (s): 62.425 samples/sec: 16.404 %comms: 0.002911101518122427 %optimizer_step 0.05646359075924372 %forward: 23.267075034235337 %backward: 62.501628287726454 [2025-03-27 19:24:48,221] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18549.87 | forward: 145244.33 | backward_microstep: 390172.41 | backward: 390165.37 | backward_inner_microstep: 390150.24 | backward_inner: 390144.44 | backward_allreduce_microstep: 7.26 | backward_allreduce: 2.51 | reduce_tied_grads: 0.33 | comms: 18.17 | reduce_grads: 0.20 | step: 352.47 | _step_clipping: 0.13 | _step_step: 350.69 | _step_zero_grad: 0.51 | _step_check_overflow: 0.57 samples/sec: 16.404 | iteration 3900/ 143000 | elapsed time per iteration (ms): 62425.4 | learning rate: 5.996E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.943584E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 19:35:11,003] [INFO] [logging.py:60:log_dist] [Rank 0] step=3910, skipped=0, lr=[0.0005995548413092955, 0.0005995548413092955], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3910 loss: 2.9257 iter time (s): 62.278 samples/sec: 16.442 %comms: 0.0028734172221373056 %optimizer_step 0.05546847178240488 %forward: 23.320793180927545 %backward: 62.64510117164909 [2025-03-27 19:35:11,004] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17111.46 | forward: 145236.55 | backward_microstep: 390146.36 | backward: 390139.32 | backward_inner_microstep: 390123.96 | backward_inner: 390118.10 | backward_allreduce_microstep: 7.26 | backward_allreduce: 2.50 | reduce_tied_grads: 0.28 | comms: 17.89 | reduce_grads: 0.18 | step: 345.44 | _step_clipping: 0.12 | _step_step: 343.80 | _step_zero_grad: 0.46 | _step_check_overflow: 0.52 samples/sec: 16.442 | iteration 3910/ 143000 | elapsed time per iteration (ms): 62278.3 | learning rate: 5.996E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.936329E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 19:45:39,200] [INFO] [logging.py:60:log_dist] [Rank 0] step=3920, skipped=0, lr=[0.0005995512449790586, 0.0005995512449790586], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3920 loss: 2.9477 iter time (s): 62.819 samples/sec: 16.301 %comms: 0.0028536233637483682 %optimizer_step 0.05542750073999394 %forward: 23.128917307126947 %backward: 62.116726032859404 [2025-03-27 19:45:39,201] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22404.89 | forward: 145293.87 | backward_microstep: 390219.02 | backward: 390211.92 | backward_inner_microstep: 390195.00 | backward_inner: 390188.91 | backward_allreduce_microstep: 7.33 | backward_allreduce: 2.51 | reduce_tied_grads: 0.29 | comms: 17.93 | reduce_grads: 0.18 | step: 348.19 | _step_clipping: 0.13 | _step_step: 346.48 | _step_zero_grad: 0.47 | _step_check_overflow: 0.55 samples/sec: 16.301 | iteration 3920/ 143000 | elapsed time per iteration (ms): 62819.7 | learning rate: 5.996E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.954490E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 19:56:01,529] [INFO] [logging.py:60:log_dist] [Rank 0] step=3930, skipped=0, lr=[0.0005995476341911352, 0.0005995476341911352], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3930 loss: 2.9587 iter time (s): 62.232 samples/sec: 16.454 %comms: 0.002867275010181054 %optimizer_step 0.055918950241312465 %forward: 23.32487588091074 %backward: 62.68842015549424 [2025-03-27 19:56:01,529] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16784.46 | forward: 145156.16 | backward_microstep: 390131.15 | backward: 390124.71 | backward_inner_microstep: 390109.91 | backward_inner: 390104.39 | backward_allreduce_microstep: 7.21 | backward_allreduce: 2.51 | reduce_tied_grads: 0.24 | comms: 17.84 | reduce_grads: 0.17 | step: 348.00 | _step_clipping: 0.11 | _step_step: 346.34 | _step_zero_grad: 0.46 | _step_check_overflow: 0.57 samples/sec: 16.454 | iteration 3930/ 143000 | elapsed time per iteration (ms): 62232.9 | learning rate: 5.995E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.958904E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 20:06:28,491] [INFO] [logging.py:60:log_dist] [Rank 0] step=3940, skipped=0, lr=[0.0005995440089456999, 0.0005995440089456999], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3940 loss: 2.9350 iter time (s): 62.696 samples/sec: 16.333 %comms: 0.0028718318282418115 %optimizer_step 0.05516705318617486 %forward: 23.16456112719164 %backward: 62.236260660497265 [2025-03-27 20:06:28,491] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21278.12 | forward: 145231.69 | backward_microstep: 390200.30 | backward: 390194.20 | backward_inner_microstep: 390179.11 | backward_inner: 390173.41 | backward_allreduce_microstep: 7.31 | backward_allreduce: 2.50 | reduce_tied_grads: 0.28 | comms: 18.01 | reduce_grads: 0.17 | step: 345.87 | _step_clipping: 0.14 | _step_step: 344.15 | _step_zero_grad: 0.47 | _step_check_overflow: 0.58 samples/sec: 16.333 | iteration 3940/ 143000 | elapsed time per iteration (ms): 62696.2 | learning rate: 5.995E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.947628E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 20:16:52,366] [INFO] [logging.py:60:log_dist] [Rank 0] step=3950, skipped=0, lr=[0.0005995403692429275, 0.0005995403692429275], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3950 loss: 2.9651 iter time (s): 62.387 samples/sec: 16.414 %comms: 0.002925061531319599 %optimizer_step 0.05558897149398787 %forward: 23.30232322637607 %backward: 62.56982566424231 [2025-03-27 20:16:52,366] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17856.82 | forward: 145376.02 | backward_microstep: 390362.38 | backward: 390353.87 | backward_inner_microstep: 390336.56 | backward_inner: 390330.43 | backward_allreduce_microstep: 7.45 | backward_allreduce: 2.55 | reduce_tied_grads: 0.32 | comms: 18.25 | reduce_grads: 0.18 | step: 346.80 | _step_clipping: 0.14 | _step_step: 344.97 | _step_zero_grad: 0.49 | _step_check_overflow: 0.63 samples/sec: 16.414 | iteration 3950/ 143000 | elapsed time per iteration (ms): 62387.5 | learning rate: 5.995E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.943403E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 20:27:20,805] [INFO] [logging.py:60:log_dist] [Rank 0] step=3960, skipped=0, lr=[0.0005995367150829937, 0.0005995367150829937], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3960 loss: 2.9451 iter time (s): 62.843 samples/sec: 16.294 %comms: 0.0028514241332155492 %optimizer_step 0.054948766753580264 %forward: 23.128427276443347 %backward: 62.10331025700773 [2025-03-27 20:27:20,805] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22555.63 | forward: 145346.76 | backward_microstep: 390284.78 | backward: 390277.95 | backward_inner_microstep: 390262.56 | backward_inner: 390256.69 | backward_allreduce_microstep: 7.44 | backward_allreduce: 2.54 | reduce_tied_grads: 0.30 | comms: 17.92 | reduce_grads: 0.18 | step: 345.32 | _step_clipping: 0.13 | _step_step: 343.64 | _step_zero_grad: 0.47 | _step_check_overflow: 0.54 samples/sec: 16.294 | iteration 3960/ 143000 | elapsed time per iteration (ms): 62843.9 | learning rate: 5.995E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.939686E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 20:37:43,082] [INFO] [logging.py:60:log_dist] [Rank 0] step=3970, skipped=0, lr=[0.000599533046466075, 0.000599533046466075], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3970 loss: 2.9300 iter time (s): 62.227 samples/sec: 16.456 %comms: 0.0028779739346718753 %optimizer_step 0.05598832676454062 %forward: 23.32864399380474 %backward: 62.70667126323593 [2025-03-27 20:37:43,082] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16712.63 | forward: 145167.50 | backward_microstep: 390211.82 | backward: 390205.74 | backward_inner_microstep: 390189.50 | backward_inner: 390184.02 | backward_allreduce_microstep: 8.72 | backward_allreduce: 2.43 | reduce_tied_grads: 0.27 | comms: 17.91 | reduce_grads: 0.18 | step: 348.40 | _step_clipping: 0.14 | _step_step: 346.67 | _step_zero_grad: 0.47 | _step_check_overflow: 0.59 samples/sec: 16.456 | iteration 3970/ 143000 | elapsed time per iteration (ms): 62227.7 | learning rate: 5.995E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.933330E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 20:48:10,983] [INFO] [logging.py:60:log_dist] [Rank 0] step=3980, skipped=0, lr=[0.0005995293633923481, 0.0005995293633923481], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3980 loss: 2.9433 iter time (s): 62.790 samples/sec: 16.308 %comms: 0.0028570934578956418 %optimizer_step 0.05510259610956688 %forward: 23.13790583723642 %backward: 62.143653808511544 [2025-03-27 20:48:10,984] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22216.50 | forward: 145281.92 | backward_microstep: 390204.09 | backward: 390197.34 | backward_inner_microstep: 390178.74 | backward_inner: 390173.13 | backward_allreduce_microstep: 10.90 | backward_allreduce: 2.48 | reduce_tied_grads: 0.29 | comms: 17.94 | reduce_grads: 0.20 | step: 345.99 | _step_clipping: 0.12 | _step_step: 344.31 | _step_zero_grad: 0.47 | _step_check_overflow: 0.54 samples/sec: 16.308 | iteration 3980/ 143000 | elapsed time per iteration (ms): 62790.1 | learning rate: 5.995E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.936336E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 20:58:34,440] [INFO] [logging.py:60:log_dist] [Rank 0] step=3990, skipped=0, lr=[0.0005995256658619911, 0.0005995256658619911], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 3990 loss: 2.9284 iter time (s): 62.345 samples/sec: 16.425 %comms: 0.002894170908160922 %optimizer_step 0.05645711707890828 %forward: 23.30055991573965 %backward: 62.58613353068687 [2025-03-27 20:58:34,441] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17805.75 | forward: 145267.73 | backward_microstep: 390201.00 | backward: 390194.29 | backward_inner_microstep: 390179.35 | backward_inner: 390171.90 | backward_allreduce_microstep: 7.18 | backward_allreduce: 2.47 | reduce_tied_grads: 0.30 | comms: 18.04 | reduce_grads: 0.18 | step: 351.98 | _step_clipping: 0.18 | _step_step: 350.18 | _step_zero_grad: 0.47 | _step_check_overflow: 0.60 samples/sec: 16.425 | iteration 3990/ 143000 | elapsed time per iteration (ms): 62345.7 | learning rate: 5.995E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.931398E+00 | loss scale: 32768.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 21:09:03,700] [INFO] [logging.py:60:log_dist] [Rank 0] step=4000, skipped=0, lr=[0.0005995219538751823, 0.0005995219538751823], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4000 loss: 2.9369 iter time (s): 62.925 samples/sec: 16.273 %comms: 0.002863352554754972 %optimizer_step 0.0561252261785947 %forward: 23.09409021961805 %backward: 62.02199992592233 [2025-03-27 21:09:03,701] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23454.40 | forward: 145320.55 | backward_microstep: 390284.48 | backward: 390276.10 | backward_inner_microstep: 390260.54 | backward_inner: 390254.21 | backward_allreduce_microstep: 7.40 | backward_allreduce: 2.53 | reduce_tied_grads: 0.34 | comms: 18.02 | reduce_grads: 0.18 | step: 353.17 | _step_clipping: 0.13 | _step_step: 351.37 | _step_zero_grad: 0.48 | _step_check_overflow: 0.60 samples/sec: 16.273 | iteration 4000/ 143000 | elapsed time per iteration (ms): 62926.0 | learning rate: 5.995E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.934622E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 21:09:06,553] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: ../checkpoints/mydata_left-pythia160m/global_step4000/mp_rank_00_model_states.pt [2025-03-27 21:09:19,675] [INFO] [engine.py:1805:_copy_recovery_script] creating recovery script ../checkpoints/mydata_left-pythia160m/zero_to_fp32.py [2025-03-27 21:09:19,682] [INFO] [engine.py:1818:_save_zero_checkpoint] zero checkpoint saved ../checkpoints/mydata_left-pythia160m/global_step4000/zero_pp_rank_0_mp_rank_00_optim_states.pt [2025-03-27 21:19:47,844] [INFO] [logging.py:60:log_dist] [Rank 0] step=4010, skipped=0, lr=[0.000599518227432101, 0.000599518227432101], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4010 loss: 2.9354 iter time (s): 62.814 samples/sec: 16.302 %comms: 0.0028458285891264446 %optimizer_step 0.054854535027771135 %forward: 23.13351248532245 %backward: 62.13056427833734 [2025-03-27 21:19:47,844] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22369.53 | forward: 145311.84 | backward_microstep: 390276.59 | backward: 390269.59 | backward_inner_microstep: 390254.13 | backward_inner: 390248.10 | backward_allreduce_microstep: 7.42 | backward_allreduce: 2.55 | reduce_tied_grads: 0.29 | comms: 17.88 | reduce_grads: 0.18 | step: 344.57 | _step_clipping: 0.12 | _step_step: 342.91 | _step_zero_grad: 0.47 | _step_check_overflow: 0.53 samples/sec: 15.897 | iteration 4010/ 143000 | elapsed time per iteration (ms): 64414.3 | learning rate: 5.995E-04 | approx flops per GPU: 68.6TFLOPS | lm_loss: 2.941155E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 21:30:09,767] [INFO] [logging.py:60:log_dist] [Rank 0] step=4020, skipped=0, lr=[0.0005995144865329269, 0.0005995144865329269], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4020 loss: 2.9294 iter time (s): 62.192 samples/sec: 16.465 %comms: 0.0028738188753549276 %optimizer_step 0.05537127182859301 %forward: 23.358170914710787 %backward: 62.73888642402979 [2025-03-27 21:30:09,768] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16271.89 | forward: 145268.78 | backward_microstep: 390190.81 | backward: 390184.72 | backward_inner_microstep: 390169.70 | backward_inner: 390164.07 | backward_allreduce_microstep: 7.28 | backward_allreduce: 2.53 | reduce_tied_grads: 0.26 | comms: 17.87 | reduce_grads: 0.18 | step: 344.36 | _step_clipping: 0.13 | _step_step: 342.70 | _step_zero_grad: 0.46 | _step_check_overflow: 0.54 samples/sec: 16.465 | iteration 4020/ 143000 | elapsed time per iteration (ms): 62192.4 | learning rate: 5.995E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.931541E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 21:40:37,793] [INFO] [logging.py:60:log_dist] [Rank 0] step=4030, skipped=0, lr=[0.0005995107311778406, 0.0005995107311778406], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4030 loss: 2.9504 iter time (s): 62.802 samples/sec: 16.305 %comms: 0.0028976421803918605 %optimizer_step 0.05573740602059904 %forward: 23.133929606595675 %backward: 62.133947280516 [2025-03-27 21:40:37,793] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22350.84 | forward: 145285.72 | backward_microstep: 390221.27 | backward: 390213.67 | backward_inner_microstep: 390198.44 | backward_inner: 390192.56 | backward_allreduce_microstep: 7.29 | backward_allreduce: 2.50 | reduce_tied_grads: 0.28 | comms: 18.20 | reduce_grads: 0.18 | step: 350.04 | _step_clipping: 0.12 | _step_step: 348.35 | _step_zero_grad: 0.47 | _step_check_overflow: 0.56 samples/sec: 16.305 | iteration 4030/ 143000 | elapsed time per iteration (ms): 62802.6 | learning rate: 5.995E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.922438E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 21:51:04,641] [INFO] [logging.py:60:log_dist] [Rank 0] step=4040, skipped=0, lr=[0.0005995069613670235, 0.0005995069613670235], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4040 loss: 2.9268 iter time (s): 62.684 samples/sec: 16.336 %comms: 0.0028497987828531688 %optimizer_step 0.05561481633717779 %forward: 23.1614700546939 %backward: 62.238622635157945 [2025-03-27 21:51:04,642] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21379.30 | forward: 145185.94 | backward_microstep: 390144.07 | backward: 390138.15 | backward_inner_microstep: 390123.38 | backward_inner: 390117.81 | backward_allreduce_microstep: 7.16 | backward_allreduce: 2.47 | reduce_tied_grads: 0.27 | comms: 17.86 | reduce_grads: 0.18 | step: 348.62 | _step_clipping: 0.14 | _step_step: 347.00 | _step_zero_grad: 0.46 | _step_check_overflow: 0.48 samples/sec: 16.336 | iteration 4040/ 143000 | elapsed time per iteration (ms): 62684.8 | learning rate: 5.995E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.934045E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 22:01:27,701] [INFO] [logging.py:60:log_dist] [Rank 0] step=4050, skipped=0, lr=[0.0005995031771006572, 0.0005995031771006572], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4050 loss: 2.9199 iter time (s): 62.305 samples/sec: 16.435 %comms: 0.0028851875827945956 %optimizer_step 0.055867973663146914 %forward: 23.311517399819817 %backward: 62.649449420164046 [2025-03-27 22:01:27,701] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17306.99 | forward: 145243.40 | backward_microstep: 390347.36 | backward: 390340.06 | backward_inner_microstep: 390325.06 | backward_inner: 390313.75 | backward_allreduce_microstep: 7.16 | backward_allreduce: 2.47 | reduce_tied_grads: 0.29 | comms: 17.98 | reduce_grads: 0.18 | step: 348.09 | _step_clipping: 0.12 | _step_step: 346.41 | _step_zero_grad: 0.47 | _step_check_overflow: 0.55 samples/sec: 16.435 | iteration 4050/ 143000 | elapsed time per iteration (ms): 62306.0 | learning rate: 5.995E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.917266E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 22:11:55,126] [INFO] [logging.py:60:log_dist] [Rank 0] step=4060, skipped=0, lr=[0.0005994993783789247, 0.0005994993783789247], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4060 loss: 2.9043 iter time (s): 62.742 samples/sec: 16.321 %comms: 0.0028500654961406153 %optimizer_step 0.055004721281199806 %forward: 23.16942657154176 %backward: 62.21210569852869 [2025-03-27 22:11:55,126] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21567.19 | forward: 145369.53 | backward_microstep: 390338.09 | backward: 390330.96 | backward_inner_microstep: 390315.65 | backward_inner: 390309.81 | backward_allreduce_microstep: 7.36 | backward_allreduce: 2.55 | reduce_tied_grads: 0.26 | comms: 17.88 | reduce_grads: 0.18 | step: 345.11 | _step_clipping: 0.12 | _step_step: 343.45 | _step_zero_grad: 0.46 | _step_check_overflow: 0.55 samples/sec: 16.321 | iteration 4060/ 143000 | elapsed time per iteration (ms): 62742.5 | learning rate: 5.995E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.920753E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 22:22:22,974] [INFO] [logging.py:60:log_dist] [Rank 0] step=4070, skipped=0, lr=[0.0005994955652020091, 0.0005994955652020091], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4070 loss: 2.9239 iter time (s): 62.784 samples/sec: 16.310 %comms: 0.0028537298148808114 %optimizer_step 0.05574424304468246 %forward: 23.12815142925045 %backward: 62.14698341820789 [2025-03-27 22:22:22,974] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22324.89 | forward: 145208.28 | backward_microstep: 390191.22 | backward: 390184.94 | backward_inner_microstep: 390169.83 | backward_inner: 390163.89 | backward_allreduce_microstep: 7.28 | backward_allreduce: 2.51 | reduce_tied_grads: 0.28 | comms: 17.92 | reduce_grads: 0.18 | step: 349.99 | _step_clipping: 0.13 | _step_step: 348.22 | _step_zero_grad: 0.49 | _step_check_overflow: 0.60 samples/sec: 16.310 | iteration 4070/ 143000 | elapsed time per iteration (ms): 62784.8 | learning rate: 5.995E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.919123E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 22:32:51,577] [INFO] [logging.py:60:log_dist] [Rank 0] step=4080, skipped=0, lr=[0.0005994917375700947, 0.0005994917375700947], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4080 loss: 2.9201 iter time (s): 62.860 samples/sec: 16.290 %comms: 0.0028834106296899867 %optimizer_step 0.0557301352430868 %forward: 23.11419293451778 %backward: 62.09316379622231 [2025-03-27 22:32:51,578] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22838.88 | forward: 145295.32 | backward_microstep: 390324.19 | backward: 390316.29 | backward_inner_microstep: 390299.13 | backward_inner: 390291.09 | backward_allreduce_microstep: 9.21 | backward_allreduce: 2.70 | reduce_tied_grads: 0.31 | comms: 18.13 | reduce_grads: 0.18 | step: 350.32 | _step_clipping: 0.14 | _step_step: 348.60 | _step_zero_grad: 0.46 | _step_check_overflow: 0.57 samples/sec: 16.290 | iteration 4080/ 143000 | elapsed time per iteration (ms): 62860.3 | learning rate: 5.995E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.919367E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 22:43:18,375] [INFO] [logging.py:60:log_dist] [Rank 0] step=4090, skipped=0, lr=[0.0005994878954833661, 0.0005994878954833661], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4090 loss: 2.9272 iter time (s): 62.679 samples/sec: 16.337 %comms: 0.002860219688379472 %optimizer_step 0.05557846496236854 %forward: 23.17402194699508 %backward: 62.266336961195044 [2025-03-27 22:43:18,376] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21129.42 | forward: 145253.07 | backward_microstep: 390287.47 | backward: 390280.82 | backward_inner_microstep: 390265.63 | backward_inner: 390258.17 | backward_allreduce_microstep: 7.20 | backward_allreduce: 2.48 | reduce_tied_grads: 0.29 | comms: 17.93 | reduce_grads: 0.18 | step: 348.36 | _step_clipping: 0.12 | _step_step: 346.71 | _step_zero_grad: 0.46 | _step_check_overflow: 0.53 samples/sec: 16.337 | iteration 4090/ 143000 | elapsed time per iteration (ms): 62679.8 | learning rate: 5.995E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.916788E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 22:53:45,288] [INFO] [logging.py:60:log_dist] [Rank 0] step=4100, skipped=0, lr=[0.0005994840389420085, 0.0005994840389420085], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4100 loss: 2.9183 iter time (s): 62.691 samples/sec: 16.334 %comms: 0.0028953009239762466 %optimizer_step 0.059030063062593924 %forward: 23.177782036803173 %backward: 62.261847736530584 [2025-03-27 22:53:45,288] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21098.18 | forward: 145302.84 | backward_microstep: 390330.97 | backward: 390323.07 | backward_inner_microstep: 390307.94 | backward_inner: 390302.07 | backward_allreduce_microstep: 7.17 | backward_allreduce: 2.48 | reduce_tied_grads: 0.31 | comms: 18.15 | reduce_grads: 0.19 | step: 370.06 | _step_clipping: 0.13 | _step_step: 368.11 | _step_zero_grad: 0.55 | _step_check_overflow: 0.63 samples/sec: 16.334 | iteration 4100/ 143000 | elapsed time per iteration (ms): 62691.2 | learning rate: 5.995E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.914353E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 23:04:08,037] [INFO] [logging.py:60:log_dist] [Rank 0] step=4110, skipped=0, lr=[0.0005994801679462084, 0.0005994801679462084], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4110 loss: 2.9151 iter time (s): 62.274 samples/sec: 16.443 %comms: 0.002871770777031478 %optimizer_step 0.055816986450408904 %forward: 23.309309982387934 %backward: 62.645096936356495 [2025-03-27 23:04:08,037] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17383.75 | forward: 145157.31 | backward_microstep: 390124.58 | backward: 390118.53 | backward_inner_microstep: 390104.07 | backward_inner: 390094.85 | backward_allreduce_microstep: 6.98 | backward_allreduce: 2.37 | reduce_tied_grads: 0.25 | comms: 17.88 | reduce_grads: 0.17 | step: 347.60 | _step_clipping: 0.13 | _step_step: 345.96 | _step_zero_grad: 0.46 | _step_check_overflow: 0.52 samples/sec: 16.443 | iteration 4110/ 143000 | elapsed time per iteration (ms): 62274.9 | learning rate: 5.995E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.909122E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 23:14:31,598] [INFO] [logging.py:60:log_dist] [Rank 0] step=4120, skipped=0, lr=[0.0005994762824961525, 0.0005994762824961525], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4120 loss: 2.9118 iter time (s): 62.356 samples/sec: 16.422 %comms: 0.0028636338501163328 %optimizer_step 0.05530623479775382 %forward: 23.30767460050658 %backward: 62.57861983522195 [2025-03-27 23:14:31,599] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17915.04 | forward: 145336.39 | backward_microstep: 390219.18 | backward: 390212.71 | backward_inner_microstep: 390197.79 | backward_inner: 390192.16 | backward_allreduce_microstep: 7.23 | backward_allreduce: 2.48 | reduce_tied_grads: 0.26 | comms: 17.86 | reduce_grads: 0.18 | step: 344.87 | _step_clipping: 0.12 | _step_step: 343.22 | _step_zero_grad: 0.46 | _step_check_overflow: 0.53 samples/sec: 16.422 | iteration 4120/ 143000 | elapsed time per iteration (ms): 62356.1 | learning rate: 5.995E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.907039E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 23:24:58,382] [INFO] [logging.py:60:log_dist] [Rank 0] step=4130, skipped=0, lr=[0.0005994723825920282, 0.0005994723825920282], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4130 loss: 2.9133 iter time (s): 62.678 samples/sec: 16.338 %comms: 0.002854425386356195 %optimizer_step 0.05538049321888608 %forward: 23.17103566357801 %backward: 62.25129456084305 [2025-03-27 23:24:58,383] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21284.02 | forward: 145231.11 | backward_microstep: 390184.13 | backward: 390177.85 | backward_inner_microstep: 390163.01 | backward_inner: 390157.41 | backward_allreduce_microstep: 7.21 | backward_allreduce: 2.49 | reduce_tied_grads: 0.46 | comms: 17.89 | reduce_grads: 0.17 | step: 347.11 | _step_clipping: 0.12 | _step_step: 345.43 | _step_zero_grad: 0.48 | _step_check_overflow: 0.55 samples/sec: 16.337 | iteration 4130/ 143000 | elapsed time per iteration (ms): 62678.4 | learning rate: 5.995E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.907835E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 23:35:21,541] [INFO] [logging.py:60:log_dist] [Rank 0] step=4140, skipped=0, lr=[0.0005994684682340241, 0.0005994684682340241], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4140 loss: 2.9099 iter time (s): 62.315 samples/sec: 16.433 %comms: 0.0028868333791128515 %optimizer_step 0.05640441538427831 %forward: 23.310167750075813 %backward: 62.61643221738067 [2025-03-27 23:35:21,542] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17591.07 | forward: 145258.08 | backward_microstep: 390203.19 | backward: 390196.35 | backward_inner_microstep: 390181.45 | backward_inner: 390175.72 | backward_allreduce_microstep: 7.18 | backward_allreduce: 2.47 | reduce_tied_grads: 0.29 | comms: 17.99 | reduce_grads: 0.18 | step: 351.49 | _step_clipping: 0.13 | _step_step: 349.77 | _step_zero_grad: 0.46 | _step_check_overflow: 0.58 samples/sec: 16.432 | iteration 4140/ 143000 | elapsed time per iteration (ms): 62315.9 | learning rate: 5.995E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.916138E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 23:45:49,088] [INFO] [logging.py:60:log_dist] [Rank 0] step=4150, skipped=0, lr=[0.0005994645394223286, 0.0005994645394223286], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4150 loss: 2.9064 iter time (s): 62.754 samples/sec: 16.318 %comms: 0.002875385098789865 %optimizer_step 0.056455918073535255 %forward: 23.15356859797811 %backward: 62.20561445157252 [2025-03-27 23:45:49,089] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21777.20 | forward: 145298.24 | backward_microstep: 390374.95 | backward: 390366.01 | backward_inner_microstep: 390350.80 | backward_inner: 390344.93 | backward_allreduce_microstep: 7.33 | backward_allreduce: 2.53 | reduce_tied_grads: 0.34 | comms: 18.04 | reduce_grads: 0.19 | step: 354.28 | _step_clipping: 0.13 | _step_step: 352.53 | _step_zero_grad: 0.49 | _step_check_overflow: 0.57 samples/sec: 16.317 | iteration 4150/ 143000 | elapsed time per iteration (ms): 62754.7 | learning rate: 5.995E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.914796E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-27 23:56:13,289] [INFO] [logging.py:60:log_dist] [Rank 0] step=4160, skipped=0, lr=[0.0005994605961571317, 0.0005994605961571317], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4160 loss: 2.8925 iter time (s): 62.420 samples/sec: 16.405 %comms: 0.002886674301627039 %optimizer_step 0.05602489569366707 %forward: 23.30839322747073 %backward: 62.53791460488939 [2025-03-27 23:56:13,290] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18270.22 | forward: 145489.89 | backward_microstep: 390366.52 | backward: 390358.70 | backward_inner_microstep: 390343.57 | backward_inner: 390337.74 | backward_allreduce_microstep: 7.22 | backward_allreduce: 2.48 | reduce_tied_grads: 0.29 | comms: 18.02 | reduce_grads: 0.18 | step: 349.70 | _step_clipping: 0.14 | _step_step: 347.94 | _step_zero_grad: 0.47 | _step_check_overflow: 0.59 samples/sec: 16.405 | iteration 4160/ 143000 | elapsed time per iteration (ms): 62420.1 | learning rate: 5.995E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.900935E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 00:06:40,749] [INFO] [logging.py:60:log_dist] [Rank 0] step=4170, skipped=0, lr=[0.0005994566384386235, 0.0005994566384386235], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4170 loss: 2.9089 iter time (s): 62.745 samples/sec: 16.320 %comms: 0.0028676914845835073 %optimizer_step 0.05566273906597267 %forward: 23.172680919555145 %backward: 62.19474898459973 [2025-03-28 00:06:40,749] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21727.89 | forward: 145397.96 | backward_microstep: 390250.84 | backward: 390243.55 | backward_inner_microstep: 390228.35 | backward_inner: 390222.59 | backward_allreduce_microstep: 7.34 | backward_allreduce: 2.55 | reduce_tied_grads: 0.26 | comms: 17.99 | reduce_grads: 0.18 | step: 349.26 | _step_clipping: 0.12 | _step_step: 347.56 | _step_zero_grad: 0.47 | _step_check_overflow: 0.56 samples/sec: 16.320 | iteration 4170/ 143000 | elapsed time per iteration (ms): 62746.0 | learning rate: 5.995E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.907590E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 00:17:03,882] [INFO] [logging.py:60:log_dist] [Rank 0] step=4180, skipped=0, lr=[0.0005994526662669952, 0.0005994526662669952], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4180 loss: 2.9104 iter time (s): 62.313 samples/sec: 16.433 %comms: 0.0028768122307675816 %optimizer_step 0.05558926548071793 %forward: 23.33173909159877 %backward: 62.655528612982856 [2025-03-28 00:17:03,883] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17176.61 | forward: 145386.55 | backward_microstep: 390432.16 | backward: 390424.00 | backward_inner_microstep: 390408.64 | backward_inner: 390402.67 | backward_allreduce_microstep: 7.33 | backward_allreduce: 2.52 | reduce_tied_grads: 0.28 | comms: 17.93 | reduce_grads: 0.18 | step: 346.39 | _step_clipping: 0.13 | _step_step: 344.69 | _step_zero_grad: 0.47 | _step_check_overflow: 0.56 samples/sec: 16.433 | iteration 4180/ 143000 | elapsed time per iteration (ms): 62313.3 | learning rate: 5.995E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.900984E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 00:27:30,762] [INFO] [logging.py:60:log_dist] [Rank 0] step=4190, skipped=0, lr=[0.0005994486796424384, 0.0005994486796424384], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4190 loss: 2.8940 iter time (s): 62.687 samples/sec: 16.335 %comms: 0.002847979350169719 %optimizer_step 0.054988722310600445 %forward: 23.176008312518526 %backward: 62.24989789802724 [2025-03-28 00:27:30,763] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21280.79 | forward: 145284.52 | backward_microstep: 390237.12 | backward: 390228.82 | backward_inner_microstep: 390211.73 | backward_inner: 390206.00 | backward_allreduce_microstep: 9.30 | backward_allreduce: 2.50 | reduce_tied_grads: 0.28 | comms: 17.85 | reduce_grads: 0.18 | step: 344.71 | _step_clipping: 0.13 | _step_step: 343.04 | _step_zero_grad: 0.46 | _step_check_overflow: 0.56 samples/sec: 16.335 | iteration 4190/ 143000 | elapsed time per iteration (ms): 62688.0 | learning rate: 5.994E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.896930E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 00:37:52,557] [INFO] [logging.py:60:log_dist] [Rank 0] step=4200, skipped=0, lr=[0.0005994446785651456, 0.0005994446785651456], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4200 loss: 2.8903 iter time (s): 62.179 samples/sec: 16.469 %comms: 0.002879050504703497 %optimizer_step 0.05712513966122157 %forward: 23.3532441835303 %backward: 62.756691475590124 [2025-03-28 00:37:52,558] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16273.86 | forward: 145208.23 | backward_microstep: 390221.24 | backward: 390215.07 | backward_inner_microstep: 390198.01 | backward_inner: 390192.34 | backward_allreduce_microstep: 7.17 | backward_allreduce: 2.46 | reduce_tied_grads: 0.29 | comms: 17.90 | reduce_grads: 0.18 | step: 355.20 | _step_clipping: 0.12 | _step_step: 353.17 | _step_zero_grad: 0.48 | _step_check_overflow: 0.88 samples/sec: 16.468 | iteration 4200/ 143000 | elapsed time per iteration (ms): 62179.6 | learning rate: 5.994E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.900753E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 00:48:22,101] [INFO] [logging.py:60:log_dist] [Rank 0] step=4210, skipped=0, lr=[0.0005994406630353098, 0.0005994406630353098], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4210 loss: 2.8941 iter time (s): 62.954 samples/sec: 16.266 %comms: 0.002872481740511412 %optimizer_step 0.05588514022638806 %forward: 23.078205756807435 %backward: 61.99190885363475 [2025-03-28 00:48:22,101] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23893.29 | forward: 145285.83 | backward_microstep: 390268.99 | backward: 390261.97 | backward_inner_microstep: 390246.64 | backward_inner: 390240.82 | backward_allreduce_microstep: 7.37 | backward_allreduce: 2.47 | reduce_tied_grads: 0.30 | comms: 18.08 | reduce_grads: 0.18 | step: 351.82 | _step_clipping: 0.15 | _step_step: 350.10 | _step_zero_grad: 0.47 | _step_check_overflow: 0.55 samples/sec: 16.266 | iteration 4210/ 143000 | elapsed time per iteration (ms): 62954.3 | learning rate: 5.994E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.902674E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 00:58:49,844] [INFO] [logging.py:60:log_dist] [Rank 0] step=4220, skipped=0, lr=[0.000599436633053125, 0.000599436633053125], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4220 loss: 2.9177 iter time (s): 62.774 samples/sec: 16.313 %comms: 0.0028760803188155296 %optimizer_step 0.0557480646586822 %forward: 23.149762933805167 %backward: 62.17127539474665 [2025-03-28 00:58:49,845] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22056.22 | forward: 145319.84 | backward_microstep: 390280.29 | backward: 390272.67 | backward_inner_microstep: 390255.53 | backward_inner: 390249.61 | backward_allreduce_microstep: 7.37 | backward_allreduce: 2.53 | reduce_tied_grads: 0.32 | comms: 18.05 | reduce_grads: 0.19 | step: 349.95 | _step_clipping: 0.13 | _step_step: 348.24 | _step_zero_grad: 0.47 | _step_check_overflow: 0.53 samples/sec: 16.312 | iteration 4220/ 143000 | elapsed time per iteration (ms): 62774.3 | learning rate: 5.994E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.913098E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 01:09:19,064] [INFO] [logging.py:60:log_dist] [Rank 0] step=4230, skipped=0, lr=[0.0005994325886187854, 0.0005994325886187854], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4230 loss: 2.8966 iter time (s): 62.921 samples/sec: 16.274 %comms: 0.002860349454297667 %optimizer_step 0.0557178552692686 %forward: 23.091065965765758 %backward: 62.017142799370305 [2025-03-28 01:09:19,065] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23611.01 | forward: 145292.40 | backward_microstep: 390229.37 | backward: 390221.03 | backward_inner_microstep: 390204.06 | backward_inner: 390198.25 | backward_allreduce_microstep: 7.35 | backward_allreduce: 2.52 | reduce_tied_grads: 0.34 | comms: 18.00 | reduce_grads: 0.22 | step: 350.58 | _step_clipping: 0.13 | _step_step: 348.90 | _step_zero_grad: 0.46 | _step_check_overflow: 0.52 samples/sec: 16.274 | iteration 4230/ 143000 | elapsed time per iteration (ms): 62922.0 | learning rate: 5.994E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.892644E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 01:19:43,056] [INFO] [logging.py:60:log_dist] [Rank 0] step=4240, skipped=0, lr=[0.0005994285297324865, 0.0005994285297324865], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4240 loss: 2.9125 iter time (s): 62.399 samples/sec: 16.411 %comms: 0.002880075080867831 %optimizer_step 0.05594437999859709 %forward: 23.302498841777254 %backward: 62.56456905897976 [2025-03-28 01:19:43,057] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18050.06 | forward: 145404.43 | backward_microstep: 390404.24 | backward: 390394.41 | backward_inner_microstep: 390378.88 | backward_inner: 390372.89 | backward_allreduce_microstep: 7.42 | backward_allreduce: 2.55 | reduce_tied_grads: 0.31 | comms: 17.97 | reduce_grads: 0.18 | step: 349.09 | _step_clipping: 0.14 | _step_step: 347.33 | _step_zero_grad: 0.47 | _step_check_overflow: 0.59 samples/sec: 16.410 | iteration 4240/ 143000 | elapsed time per iteration (ms): 62399.2 | learning rate: 5.994E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.898615E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 01:30:06,492] [INFO] [logging.py:60:log_dist] [Rank 0] step=4250, skipped=0, lr=[0.000599424456394424, 0.000599424456394424], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4250 loss: 2.8861 iter time (s): 62.343 samples/sec: 16.425 %comms: 0.0029053245876834726 %optimizer_step 0.05687307687145322 %forward: 23.319193555428196 %backward: 62.62631645818564 [2025-03-28 01:30:06,493] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17497.02 | forward: 145378.80 | backward_microstep: 390439.74 | backward: 390431.12 | backward_inner_microstep: 390415.73 | backward_inner: 390409.65 | backward_allreduce_microstep: 7.28 | backward_allreduce: 2.50 | reduce_tied_grads: 0.35 | comms: 18.11 | reduce_grads: 0.20 | step: 354.56 | _step_clipping: 0.16 | _step_step: 351.00 | _step_zero_grad: 0.47 | _step_check_overflow: 0.57 samples/sec: 16.425 | iteration 4250/ 143000 | elapsed time per iteration (ms): 62343.6 | learning rate: 5.994E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.888419E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 01:40:32,242] [INFO] [logging.py:60:log_dist] [Rank 0] step=4260, skipped=0, lr=[0.0005994203686047947, 0.0005994203686047947], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4260 loss: 2.8860 iter time (s): 62.574 samples/sec: 16.365 %comms: 0.0028756804380136586 %optimizer_step 0.05563024648914649 %forward: 23.246153433642437 %backward: 62.38063914893838 [2025-03-28 01:40:32,243] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19840.35 | forward: 145461.46 | backward_microstep: 390350.81 | backward: 390343.24 | backward_inner_microstep: 390325.33 | backward_inner: 390319.33 | backward_allreduce_microstep: 9.75 | backward_allreduce: 4.74 | reduce_tied_grads: 0.29 | comms: 17.99 | reduce_grads: 0.19 | step: 348.10 | _step_clipping: 0.12 | _step_step: 346.37 | _step_zero_grad: 0.49 | _step_check_overflow: 0.56 samples/sec: 16.364 | iteration 4260/ 143000 | elapsed time per iteration (ms): 62575.0 | learning rate: 5.994E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.886918E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 01:50:53,553] [INFO] [logging.py:60:log_dist] [Rank 0] step=4270, skipped=0, lr=[0.0005994162663637957, 0.0005994162663637957], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4270 loss: 2.8886 iter time (s): 62.131 samples/sec: 16.481 %comms: 0.002878917507922219 %optimizer_step 0.05705462455232196 %forward: 23.401719172741238 %backward: 62.8243831133056 [2025-03-28 01:50:53,554] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15531.59 | forward: 145396.20 | backward_microstep: 390339.90 | backward: 390331.44 | backward_inner_microstep: 390315.81 | backward_inner: 390310.05 | backward_allreduce_microstep: 7.50 | backward_allreduce: 2.57 | reduce_tied_grads: 0.27 | comms: 17.89 | reduce_grads: 0.19 | step: 354.48 | _step_clipping: 0.12 | _step_step: 352.87 | _step_zero_grad: 0.47 | _step_check_overflow: 0.49 samples/sec: 16.481 | iteration 4270/ 143000 | elapsed time per iteration (ms): 62131.1 | learning rate: 5.994E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.888677E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 02:01:17,297] [INFO] [logging.py:60:log_dist] [Rank 0] step=4280, skipped=0, lr=[0.000599412149671625, 0.000599412149671625], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4280 loss: 2.8841 iter time (s): 62.374 samples/sec: 16.417 %comms: 0.002921507337061165 %optimizer_step 0.05578224761965955 %forward: 23.292320593264073 %backward: 62.58258640182596 [2025-03-28 02:01:17,298] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18038.70 | forward: 145283.21 | backward_microstep: 390362.11 | backward: 390351.78 | backward_inner_microstep: 390336.19 | backward_inner: 390330.19 | backward_allreduce_microstep: 7.45 | backward_allreduce: 2.56 | reduce_tied_grads: 0.34 | comms: 18.22 | reduce_grads: 0.54 | step: 347.94 | _step_clipping: 0.13 | _step_step: 346.23 | _step_zero_grad: 0.49 | _step_check_overflow: 0.54 samples/sec: 16.417 | iteration 4280/ 143000 | elapsed time per iteration (ms): 62374.4 | learning rate: 5.994E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.889799E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 02:11:41,320] [INFO] [logging.py:60:log_dist] [Rank 0] step=4290, skipped=0, lr=[0.0005994080185284814, 0.0005994080185284814], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4290 loss: 2.9042 iter time (s): 62.402 samples/sec: 16.410 %comms: 0.002875885354875744 %optimizer_step 0.056467967488300026 %forward: 23.280715167579068 %backward: 62.55110693106602 [2025-03-28 02:11:41,320] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18369.50 | forward: 145275.55 | backward_microstep: 390336.70 | backward: 390329.35 | backward_inner_microstep: 390313.92 | backward_inner: 390308.03 | backward_allreduce_microstep: 7.43 | backward_allreduce: 2.56 | reduce_tied_grads: 0.30 | comms: 17.95 | reduce_grads: 0.18 | step: 352.37 | _step_clipping: 0.13 | _step_step: 350.63 | _step_zero_grad: 0.48 | _step_check_overflow: 0.58 samples/sec: 16.410 | iteration 4290/ 143000 | elapsed time per iteration (ms): 62402.2 | learning rate: 5.994E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.891220E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 02:22:05,597] [INFO] [logging.py:60:log_dist] [Rank 0] step=4300, skipped=0, lr=[0.0005994038729345642, 0.0005994038729345642], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4300 loss: 2.9067 iter time (s): 62.427 samples/sec: 16.403 %comms: 0.0028762382668141937 %optimizer_step 0.05752308491132948 %forward: 23.294468799800217 %backward: 62.54030533148032 [2025-03-28 02:22:05,598] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18292.39 | forward: 145420.78 | backward_microstep: 390432.40 | backward: 390421.43 | backward_inner_microstep: 390405.86 | backward_inner: 390399.81 | backward_allreduce_microstep: 7.29 | backward_allreduce: 2.54 | reduce_tied_grads: 0.28 | comms: 17.96 | reduce_grads: 0.18 | step: 359.10 | _step_clipping: 0.13 | _step_step: 357.38 | _step_zero_grad: 0.47 | _step_check_overflow: 0.57 samples/sec: 16.403 | iteration 4300/ 143000 | elapsed time per iteration (ms): 62427.7 | learning rate: 5.994E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.890389E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 02:32:29,922] [INFO] [logging.py:60:log_dist] [Rank 0] step=4310, skipped=0, lr=[0.0005993997128900736, 0.0005993997128900736], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4310 loss: 2.8810 iter time (s): 62.432 samples/sec: 16.402 %comms: 0.0028656298312906346 %optimizer_step 0.055156318919633324 %forward: 23.29583424824641 %backward: 62.539324920859606 [2025-03-28 02:32:29,923] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18334.70 | forward: 145440.48 | backward_microstep: 390455.21 | backward: 390445.33 | backward_inner_microstep: 390429.67 | backward_inner: 390423.60 | backward_allreduce_microstep: 7.31 | backward_allreduce: 2.51 | reduce_tied_grads: 0.28 | comms: 17.89 | reduce_grads: 0.18 | step: 344.35 | _step_clipping: 0.12 | _step_step: 342.80 | _step_zero_grad: 0.46 | _step_check_overflow: 0.44 samples/sec: 16.402 | iteration 4310/ 143000 | elapsed time per iteration (ms): 62432.6 | learning rate: 5.994E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.894597E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 02:42:52,812] [INFO] [logging.py:60:log_dist] [Rank 0] step=4320, skipped=0, lr=[0.0005993955383952102, 0.0005993955383952102], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4320 loss: 2.8559 iter time (s): 62.288 samples/sec: 16.440 %comms: 0.002875451044302267 %optimizer_step 0.055683611087411426 %forward: 23.33275278360918 %backward: 62.66381012322905 [2025-03-28 02:42:52,813] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17178.75 | forward: 145335.94 | backward_microstep: 390332.82 | backward: 390322.73 | backward_inner_microstep: 390305.44 | backward_inner: 390297.44 | backward_allreduce_microstep: 9.28 | backward_allreduce: 2.73 | reduce_tied_grads: 0.29 | comms: 17.91 | reduce_grads: 0.18 | step: 346.84 | _step_clipping: 0.13 | _step_step: 345.18 | _step_zero_grad: 0.46 | _step_check_overflow: 0.53 samples/sec: 16.440 | iteration 4320/ 143000 | elapsed time per iteration (ms): 62288.9 | learning rate: 5.994E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.888044E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 02:53:14,324] [INFO] [logging.py:60:log_dist] [Rank 0] step=4330, skipped=0, lr=[0.0005993913494501757, 0.0005993913494501757], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4330 loss: 2.8942 iter time (s): 62.151 samples/sec: 16.476 %comms: 0.004096153084776049 %optimizer_step 0.05760427701693698 %forward: 23.420301144864506 %backward: 62.83499641471904 [2025-03-28 02:53:14,325] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15330.12 | forward: 145558.70 | backward_microstep: 390534.96 | backward: 390523.61 | backward_inner_microstep: 390507.85 | backward_inner: 390501.70 | backward_allreduce_microstep: 7.40 | backward_allreduce: 2.55 | reduce_tied_grads: 0.30 | comms: 25.46 | reduce_grads: 0.19 | step: 358.01 | _step_clipping: 0.11 | _step_step: 356.25 | _step_zero_grad: 0.51 | _step_check_overflow: 0.55 samples/sec: 16.476 | iteration 4330/ 143000 | elapsed time per iteration (ms): 62151.2 | learning rate: 5.994E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.887686E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 03:03:41,601] [INFO] [logging.py:60:log_dist] [Rank 0] step=4340, skipped=0, lr=[0.0005993871460551721, 0.0005993871460551721], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4340 loss: 2.8734 iter time (s): 62.727 samples/sec: 16.325 %comms: 0.002851839660452933 %optimizer_step 0.055545517215431085 %forward: 23.15575980751331 %backward: 62.21754530467772 [2025-03-28 03:03:41,602] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21722.75 | forward: 145249.54 | backward_microstep: 390281.34 | backward: 390273.08 | backward_inner_microstep: 390257.70 | backward_inner: 390251.87 | backward_allreduce_microstep: 7.39 | backward_allreduce: 2.56 | reduce_tied_grads: 0.28 | comms: 17.89 | reduce_grads: 0.18 | step: 348.42 | _step_clipping: 0.13 | _step_step: 346.74 | _step_zero_grad: 0.46 | _step_check_overflow: 0.55 samples/sec: 16.325 | iteration 4340/ 143000 | elapsed time per iteration (ms): 62727.7 | learning rate: 5.994E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.882827E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 03:14:13,292] [INFO] [logging.py:60:log_dist] [Rank 0] step=4350, skipped=0, lr=[0.0005993829282104024, 0.0005993829282104024], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4350 loss: 2.8876 iter time (s): 63.168 samples/sec: 16.211 %comms: 0.0028444466428813385 %optimizer_step 0.05578045053268241 %forward: 23.026615886523576 %backward: 61.78582165549934 [2025-03-28 03:14:13,293] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25904.91 | forward: 145455.65 | backward_microstep: 390298.71 | backward: 390291.69 | backward_inner_microstep: 390274.56 | backward_inner: 390268.62 | backward_allreduce_microstep: 7.42 | backward_allreduce: 2.57 | reduce_tied_grads: 0.30 | comms: 17.97 | reduce_grads: 0.19 | step: 352.36 | _step_clipping: 0.14 | _step_step: 350.63 | _step_zero_grad: 0.46 | _step_check_overflow: 0.55 samples/sec: 16.210 | iteration 4350/ 143000 | elapsed time per iteration (ms): 63169.0 | learning rate: 5.994E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.888704E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 03:24:45,059] [INFO] [logging.py:60:log_dist] [Rank 0] step=4360, skipped=0, lr=[0.0005993786959160701, 0.0005993786959160701], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4360 loss: 2.8657 iter time (s): 63.176 samples/sec: 16.209 %comms: 0.00283772768734664 %optimizer_step 0.055556272906325435 %forward: 22.984536638994367 %backward: 61.76498672859248 [2025-03-28 03:24:45,059] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26346.36 | forward: 145207.26 | backward_microstep: 390213.20 | backward: 390206.88 | backward_inner_microstep: 390191.70 | backward_inner: 390185.95 | backward_allreduce_microstep: 7.35 | backward_allreduce: 2.52 | reduce_tied_grads: 0.30 | comms: 17.93 | reduce_grads: 0.18 | step: 350.98 | _step_clipping: 0.14 | _step_step: 349.24 | _step_zero_grad: 0.47 | _step_check_overflow: 0.58 samples/sec: 16.209 | iteration 4360/ 143000 | elapsed time per iteration (ms): 63176.7 | learning rate: 5.994E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.877971E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 03:35:19,559] [INFO] [logging.py:60:log_dist] [Rank 0] step=4370, skipped=0, lr=[0.0005993744491723794, 0.0005993744491723794], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4370 loss: 2.8870 iter time (s): 63.449 samples/sec: 16.139 %comms: 0.0028195997309719257 %optimizer_step 0.05515168186813021 %forward: 22.905749815045706 %backward: 61.5069505406724 [2025-03-28 03:35:19,560] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28873.58 | forward: 145335.80 | backward_microstep: 390265.61 | backward: 390258.43 | backward_inner_microstep: 390239.99 | backward_inner: 390234.22 | backward_allreduce_microstep: 8.88 | backward_allreduce: 4.13 | reduce_tied_grads: 0.28 | comms: 17.89 | reduce_grads: 0.19 | step: 349.93 | _step_clipping: 0.12 | _step_step: 348.31 | _step_zero_grad: 0.46 | _step_check_overflow: 0.51 samples/sec: 16.139 | iteration 4370/ 143000 | elapsed time per iteration (ms): 63450.0 | learning rate: 5.994E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.883446E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 03:45:44,908] [INFO] [logging.py:60:log_dist] [Rank 0] step=4380, skipped=0, lr=[0.0005993701879795354, 0.0005993701879795354], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4380 loss: 2.8575 iter time (s): 62.534 samples/sec: 16.375 %comms: 0.0028678771829813996 %optimizer_step 0.05643146268789646 %forward: 23.24782075495005 %backward: 62.407814529733876 [2025-03-28 03:45:44,909] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19709.36 | forward: 145378.74 | backward_microstep: 390269.95 | backward: 390263.22 | backward_inner_microstep: 390247.95 | backward_inner: 390242.23 | backward_allreduce_microstep: 7.40 | backward_allreduce: 2.54 | reduce_tied_grads: 0.29 | comms: 17.93 | reduce_grads: 0.18 | step: 352.89 | _step_clipping: 0.13 | _step_step: 351.17 | _step_zero_grad: 0.47 | _step_check_overflow: 0.58 samples/sec: 16.375 | iteration 4380/ 143000 | elapsed time per iteration (ms): 62534.9 | learning rate: 5.994E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.876080E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 03:56:14,163] [INFO] [logging.py:60:log_dist] [Rank 0] step=4390, skipped=0, lr=[0.0005993659123377437, 0.0005993659123377437], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4390 loss: 2.8766 iter time (s): 62.925 samples/sec: 16.273 %comms: 0.0028436720215720174 %optimizer_step 0.05549339638142546 %forward: 23.08507470102702 %backward: 62.01841030216466 [2025-03-28 03:56:14,164] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23749.32 | forward: 145262.71 | backward_microstep: 390257.26 | backward: 390250.52 | backward_inner_microstep: 390233.89 | backward_inner: 390228.21 | backward_allreduce_microstep: 8.86 | backward_allreduce: 2.48 | reduce_tied_grads: 0.27 | comms: 17.89 | reduce_grads: 0.18 | step: 349.19 | _step_clipping: 0.12 | _step_step: 347.50 | _step_zero_grad: 0.46 | _step_check_overflow: 0.57 samples/sec: 16.273 | iteration 4390/ 143000 | elapsed time per iteration (ms): 62925.5 | learning rate: 5.994E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.886144E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 04:06:35,431] [INFO] [logging.py:60:log_dist] [Rank 0] step=4400, skipped=0, lr=[0.0005993616222472108, 0.0005993616222472108], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4400 loss: 2.8777 iter time (s): 62.126 samples/sec: 16.483 %comms: 0.0028744715455395107 %optimizer_step 0.05624545885198454 %forward: 23.40256861206946 %backward: 62.82710168053725 [2025-03-28 04:06:35,432] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15548.82 | forward: 145391.50 | backward_microstep: 390328.79 | backward: 390321.54 | backward_inner_microstep: 390306.27 | backward_inner: 390300.44 | backward_allreduce_microstep: 7.33 | backward_allreduce: 2.51 | reduce_tied_grads: 0.27 | comms: 17.86 | reduce_grads: 0.18 | step: 349.43 | _step_clipping: 0.12 | _step_step: 347.77 | _step_zero_grad: 0.46 | _step_check_overflow: 0.55 samples/sec: 16.482 | iteration 4400/ 143000 | elapsed time per iteration (ms): 62126.8 | learning rate: 5.994E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.871085E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 04:16:56,244] [INFO] [logging.py:60:log_dist] [Rank 0] step=4410, skipped=0, lr=[0.0005993573177081434, 0.0005993573177081434], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4410 loss: 2.8715 iter time (s): 62.081 samples/sec: 16.495 %comms: 0.0028828798260149883 %optimizer_step 0.057425210275832 %forward: 23.409032577039756 %backward: 62.86281481959042 [2025-03-28 04:16:56,245] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15215.99 | forward: 145324.99 | backward_microstep: 390264.24 | backward: 390256.96 | backward_inner_microstep: 390241.74 | backward_inner: 390235.91 | backward_allreduce_microstep: 7.28 | backward_allreduce: 2.49 | reduce_tied_grads: 0.28 | comms: 17.90 | reduce_grads: 0.18 | step: 356.50 | _step_clipping: 0.11 | _step_step: 354.80 | _step_zero_grad: 0.48 | _step_check_overflow: 0.56 samples/sec: 16.495 | iteration 4410/ 143000 | elapsed time per iteration (ms): 62081.3 | learning rate: 5.994E-04 | approx flops per GPU: 71.2TFLOPS | lm_loss: 2.870799E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 04:27:18,003] [INFO] [logging.py:60:log_dist] [Rank 0] step=4420, skipped=0, lr=[0.0005993529987207495, 0.0005993529987207495], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4420 loss: 2.8774 iter time (s): 62.175 samples/sec: 16.470 %comms: 0.002873662563640726 %optimizer_step 0.056209207868119386 %forward: 23.376788827432485 %backward: 62.76865269847679 [2025-03-28 04:27:18,003] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16151.18 | forward: 145345.93 | backward_microstep: 390273.37 | backward: 390266.09 | backward_inner_microstep: 390250.82 | backward_inner: 390244.99 | backward_allreduce_microstep: 7.28 | backward_allreduce: 2.52 | reduce_tied_grads: 0.26 | comms: 17.87 | reduce_grads: 0.18 | step: 349.48 | _step_clipping: 0.12 | _step_step: 347.84 | _step_zero_grad: 0.47 | _step_check_overflow: 0.52 samples/sec: 16.469 | iteration 4420/ 143000 | elapsed time per iteration (ms): 62175.8 | learning rate: 5.994E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.869906E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 04:37:40,935] [INFO] [logging.py:60:log_dist] [Rank 0] step=4430, skipped=0, lr=[0.0005993486652852377, 0.0005993486652852377], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4430 loss: 2.8828 iter time (s): 62.293 samples/sec: 16.439 %comms: 0.002885280471860709 %optimizer_step 0.0564711219882602 %forward: 23.3178345706313 %backward: 62.6385772682108 [2025-03-28 04:37:40,935] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17476.72 | forward: 145253.03 | backward_microstep: 390199.17 | backward: 390192.47 | backward_inner_microstep: 390177.45 | backward_inner: 390171.80 | backward_allreduce_microstep: 7.23 | backward_allreduce: 2.49 | reduce_tied_grads: 0.30 | comms: 17.97 | reduce_grads: 0.18 | step: 351.77 | _step_clipping: 0.13 | _step_step: 350.03 | _step_zero_grad: 0.47 | _step_check_overflow: 0.59 samples/sec: 16.438 | iteration 4430/ 143000 | elapsed time per iteration (ms): 62293.2 | learning rate: 5.993E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.870976E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 04:48:10,147] [INFO] [logging.py:60:log_dist] [Rank 0] step=4440, skipped=0, lr=[0.000599344317401817, 0.000599344317401817], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4440 loss: 2.8788 iter time (s): 62.921 samples/sec: 16.274 %comms: 0.0028543596693063147 %optimizer_step 0.05580587845804292 %forward: 23.106401496177213 %backward: 62.03454595938626 [2025-03-28 04:48:10,148] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23475.83 | forward: 145387.11 | backward_microstep: 390332.94 | backward: 390325.76 | backward_inner_microstep: 390310.54 | backward_inner: 390304.29 | backward_allreduce_microstep: 7.31 | backward_allreduce: 2.49 | reduce_tied_grads: 0.30 | comms: 17.96 | reduce_grads: 0.18 | step: 351.13 | _step_clipping: 0.14 | _step_step: 349.38 | _step_zero_grad: 0.47 | _step_check_overflow: 0.59 samples/sec: 16.274 | iteration 4440/ 143000 | elapsed time per iteration (ms): 62921.3 | learning rate: 5.993E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.872760E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 04:58:37,865] [INFO] [logging.py:60:log_dist] [Rank 0] step=4450, skipped=0, lr=[0.0005993399550706971, 0.0005993399550706971], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4450 loss: 2.8777 iter time (s): 62.771 samples/sec: 16.313 %comms: 0.002858611602510074 %optimizer_step 0.06040033388243197 %forward: 23.14983574936264 %backward: 62.17713445781212 [2025-03-28 04:58:37,866] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22075.81 | forward: 145314.39 | backward_microstep: 390300.62 | backward: 390293.57 | backward_inner_microstep: 390278.25 | backward_inner: 390272.47 | backward_allreduce_microstep: 7.26 | backward_allreduce: 2.53 | reduce_tied_grads: 0.30 | comms: 17.94 | reduce_grads: 0.18 | step: 379.14 | _step_clipping: 0.13 | _step_step: 377.43 | _step_zero_grad: 0.48 | _step_check_overflow: 0.55 samples/sec: 16.313 | iteration 4450/ 143000 | elapsed time per iteration (ms): 62771.8 | learning rate: 5.993E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.876627E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 05:09:01,352] [INFO] [logging.py:60:log_dist] [Rank 0] step=4460, skipped=0, lr=[0.0005993355782920887, 0.0005993355782920887], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4460 loss: 2.8707 iter time (s): 62.348 samples/sec: 16.424 %comms: 0.0028887580803287666 %optimizer_step 0.05653446748645478 %forward: 23.2958916359668 %backward: 62.622175952941895 [2025-03-28 05:09:01,353] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17788.98 | forward: 145245.43 | backward_microstep: 390446.80 | backward: 390437.30 | backward_inner_microstep: 390422.29 | backward_inner: 390416.51 | backward_allreduce_microstep: 7.14 | backward_allreduce: 2.46 | reduce_tied_grads: 0.34 | comms: 18.01 | reduce_grads: 0.20 | step: 352.48 | _step_clipping: 0.14 | _step_step: 350.74 | _step_zero_grad: 0.47 | _step_check_overflow: 0.58 samples/sec: 16.424 | iteration 4460/ 143000 | elapsed time per iteration (ms): 62348.7 | learning rate: 5.993E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.866755E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 05:19:28,231] [INFO] [logging.py:60:log_dist] [Rank 0] step=4470, skipped=0, lr=[0.0005993311870662032, 0.0005993311870662032], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4470 loss: 2.8696 iter time (s): 62.687 samples/sec: 16.335 %comms: 0.003061504011356397 %optimizer_step 0.05682555507622402 %forward: 23.177354242558863 %backward: 62.261400354100424 [2025-03-28 05:19:28,232] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21254.78 | forward: 145292.58 | backward_microstep: 390307.11 | backward: 390299.91 | backward_inner_microstep: 390280.56 | backward_inner: 390274.63 | backward_allreduce_microstep: 11.29 | backward_allreduce: 2.56 | reduce_tied_grads: 0.30 | comms: 19.19 | reduce_grads: 0.19 | step: 356.22 | _step_clipping: 0.12 | _step_step: 354.52 | _step_zero_grad: 0.48 | _step_check_overflow: 0.56 samples/sec: 16.335 | iteration 4470/ 143000 | elapsed time per iteration (ms): 62687.8 | learning rate: 5.993E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.875463E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 05:29:57,322] [INFO] [logging.py:60:log_dist] [Rank 0] step=4480, skipped=0, lr=[0.0005993267813932521, 0.0005993267813932521], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4480 loss: 2.8395 iter time (s): 62.909 samples/sec: 16.278 %comms: 0.0028425956744827555 %optimizer_step 0.05533601911351569 %forward: 23.119455882466784 %backward: 62.03559996117873 [2025-03-28 05:29:57,322] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23375.87 | forward: 145441.06 | backward_microstep: 390264.60 | backward: 390256.74 | backward_inner_microstep: 390241.27 | backward_inner: 390233.39 | backward_allreduce_microstep: 7.40 | backward_allreduce: 2.55 | reduce_tied_grads: 0.28 | comms: 17.88 | reduce_grads: 0.18 | step: 348.11 | _step_clipping: 0.12 | _step_step: 346.51 | _step_zero_grad: 0.46 | _step_check_overflow: 0.48 samples/sec: 16.277 | iteration 4480/ 143000 | elapsed time per iteration (ms): 62909.0 | learning rate: 5.993E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.855394E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 05:40:16,250] [INFO] [logging.py:60:log_dist] [Rank 0] step=4490, skipped=0, lr=[0.0005993223612734486, 0.0005993223612734486], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4490 loss: 2.8981 iter time (s): 61.892 samples/sec: 16.545 %comms: 0.002893351027950069 %optimizer_step 0.0557460644380068 %forward: 23.475812860083284 %backward: 63.063409666471834 [2025-03-28 05:40:16,251] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13302.55 | forward: 145297.24 | backward_microstep: 390321.74 | backward: 390314.04 | backward_inner_microstep: 390298.89 | backward_inner: 390293.00 | backward_allreduce_microstep: 7.25 | backward_allreduce: 2.50 | reduce_tied_grads: 0.28 | comms: 17.91 | reduce_grads: 0.19 | step: 345.03 | _step_clipping: 0.12 | _step_step: 343.30 | _step_zero_grad: 0.48 | _step_check_overflow: 0.58 samples/sec: 16.545 | iteration 4490/ 143000 | elapsed time per iteration (ms): 61892.9 | learning rate: 5.993E-04 | approx flops per GPU: 71.4TFLOPS | lm_loss: 2.883633E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 05:50:42,830] [INFO] [logging.py:60:log_dist] [Rank 0] step=4500, skipped=0, lr=[0.0005993179267070055, 0.0005993179267070055], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4500 loss: 2.8682 iter time (s): 62.657 samples/sec: 16.343 %comms: 0.0028495332640918352 %optimizer_step 0.059337277062206366 %forward: 23.18362626976935 %backward: 62.27182057122778 [2025-03-28 05:50:42,831] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21123.90 | forward: 145262.72 | backward_microstep: 390189.13 | backward: 390179.42 | backward_inner_microstep: 390153.79 | backward_inner: 390148.17 | backward_allreduce_microstep: 16.29 | backward_allreduce: 6.12 | reduce_tied_grads: 0.26 | comms: 17.85 | reduce_grads: 0.18 | step: 371.79 | _step_clipping: 0.12 | _step_step: 370.09 | _step_zero_grad: 0.46 | _step_check_overflow: 0.57 samples/sec: 16.343 | iteration 4500/ 143000 | elapsed time per iteration (ms): 62658.0 | learning rate: 5.993E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.879526E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 06:01:11,440] [INFO] [logging.py:60:log_dist] [Rank 0] step=4510, skipped=0, lr=[0.0005993134776941373, 0.0005993134776941373], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4510 loss: 2.8722 iter time (s): 62.860 samples/sec: 16.290 %comms: 0.0028479571420378384 %optimizer_step 0.05537305888905025 %forward: 23.106348188180135 %backward: 62.077500230954655 [2025-03-28 06:01:11,440] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23172.60 | forward: 145247.44 | backward_microstep: 390228.01 | backward: 390221.69 | backward_inner_microstep: 390206.70 | backward_inner: 390201.19 | backward_allreduce_microstep: 7.16 | backward_allreduce: 2.45 | reduce_tied_grads: 0.27 | comms: 17.90 | reduce_grads: 0.18 | step: 348.08 | _step_clipping: 0.13 | _step_step: 346.40 | _step_zero_grad: 0.46 | _step_check_overflow: 0.54 samples/sec: 16.290 | iteration 4510/ 143000 | elapsed time per iteration (ms): 62861.0 | learning rate: 5.993E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.872274E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 06:11:33,872] [INFO] [logging.py:60:log_dist] [Rank 0] step=4520, skipped=0, lr=[0.0005993090142350582, 0.0005993090142350582], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4520 loss: 2.8564 iter time (s): 62.243 samples/sec: 16.452 %comms: 0.002885987551981048 %optimizer_step 0.05535386408067177 %forward: 23.342386156962167 %backward: 62.692107871217964 [2025-03-28 06:11:33,873] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16955.93 | forward: 145289.36 | backward_microstep: 390219.92 | backward: 390212.72 | backward_inner_microstep: 390195.47 | backward_inner: 390189.69 | backward_allreduce_microstep: 9.35 | backward_allreduce: 2.49 | reduce_tied_grads: 0.28 | comms: 17.96 | reduce_grads: 0.18 | step: 344.54 | _step_clipping: 0.13 | _step_step: 342.86 | _step_zero_grad: 0.47 | _step_check_overflow: 0.53 samples/sec: 16.452 | iteration 4520/ 143000 | elapsed time per iteration (ms): 62243.3 | learning rate: 5.993E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.858297E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 06:21:59,101] [INFO] [logging.py:60:log_dist] [Rank 0] step=4530, skipped=0, lr=[0.0005993045363299842, 0.0005993045363299842], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4530 loss: 2.8703 iter time (s): 62.522 samples/sec: 16.378 %comms: 0.0028540532258222013 %optimizer_step 0.055162231954446445 %forward: 23.229693244560156 %backward: 62.41160684675807 [2025-03-28 06:21:59,102] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19809.87 | forward: 145237.41 | backward_microstep: 390218.50 | backward: 390211.78 | backward_inner_microstep: 390194.63 | backward_inner: 390188.79 | backward_allreduce_microstep: 7.28 | backward_allreduce: 2.50 | reduce_tied_grads: 0.27 | comms: 17.84 | reduce_grads: 0.18 | step: 344.89 | _step_clipping: 0.13 | _step_step: 343.23 | _step_zero_grad: 0.47 | _step_check_overflow: 0.53 samples/sec: 16.378 | iteration 4530/ 143000 | elapsed time per iteration (ms): 62522.9 | learning rate: 5.993E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.861435E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 06:32:26,776] [INFO] [logging.py:60:log_dist] [Rank 0] step=4540, skipped=0, lr=[0.0005993000439791311, 0.0005993000439791311], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4540 loss: 2.8620 iter time (s): 62.767 samples/sec: 16.314 %comms: 0.002865000170916205 %optimizer_step 0.056115638747072295 %forward: 23.14055494913147 %backward: 62.16939789668443 [2025-03-28 06:32:26,776] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22235.59 | forward: 145246.12 | backward_microstep: 390225.34 | backward: 390218.11 | backward_inner_microstep: 390203.07 | backward_inner: 390197.03 | backward_allreduce_microstep: 7.18 | backward_allreduce: 2.46 | reduce_tied_grads: 0.32 | comms: 17.98 | reduce_grads: 0.20 | step: 352.22 | _step_clipping: 0.17 | _step_step: 350.48 | _step_zero_grad: 0.48 | _step_check_overflow: 0.52 samples/sec: 16.314 | iteration 4540/ 143000 | elapsed time per iteration (ms): 62767.5 | learning rate: 5.993E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.858625E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 06:42:50,277] [INFO] [logging.py:60:log_dist] [Rank 0] step=4550, skipped=0, lr=[0.0005992955371827158, 0.0005992955371827158], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4550 loss: 2.8837 iter time (s): 62.350 samples/sec: 16.424 %comms: 0.0028670826991875477 %optimizer_step 0.055879777545463194 %forward: 23.30912319878602 %backward: 62.590109769215864 [2025-03-28 06:42:50,278] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17963.12 | forward: 145331.47 | backward_microstep: 390253.24 | backward: 390246.88 | backward_inner_microstep: 390231.81 | backward_inner: 390226.14 | backward_allreduce_microstep: 7.30 | backward_allreduce: 2.50 | reduce_tied_grads: 0.27 | comms: 17.88 | reduce_grads: 0.18 | step: 348.41 | _step_clipping: 0.12 | _step_step: 346.76 | _step_zero_grad: 0.46 | _step_check_overflow: 0.54 samples/sec: 16.423 | iteration 4550/ 143000 | elapsed time per iteration (ms): 62350.1 | learning rate: 5.993E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.870309E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 06:53:16,287] [INFO] [logging.py:60:log_dist] [Rank 0] step=4560, skipped=0, lr=[0.0005992910159409558, 0.0005992910159409558], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4560 loss: 2.8627 iter time (s): 62.600 samples/sec: 16.358 %comms: 0.0028527385727771808 %optimizer_step 0.055035768515473614 %forward: 23.199011501157397 %backward: 62.33686408065733 [2025-03-28 06:53:16,287] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20590.46 | forward: 145226.82 | backward_microstep: 390239.39 | backward: 390231.46 | backward_inner_microstep: 390216.42 | backward_inner: 390210.82 | backward_allreduce_microstep: 7.21 | backward_allreduce: 2.48 | reduce_tied_grads: 0.26 | comms: 17.86 | reduce_grads: 0.17 | step: 344.53 | _step_clipping: 0.12 | _step_step: 342.83 | _step_zero_grad: 0.48 | _step_check_overflow: 0.57 samples/sec: 16.358 | iteration 4560/ 143000 | elapsed time per iteration (ms): 62601.0 | learning rate: 5.993E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.864629E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 07:03:42,699] [INFO] [logging.py:60:log_dist] [Rank 0] step=4570, skipped=0, lr=[0.0005992864802540691, 0.0005992864802540691], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4570 loss: 2.8673 iter time (s): 62.641 samples/sec: 16.347 %comms: 0.002865219356406191 %optimizer_step 0.055824448177883484 %forward: 23.18278094384599 %backward: 62.29068309250538 [2025-03-28 07:03:42,699] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21067.77 | forward: 145218.38 | backward_microstep: 390199.74 | backward: 390192.71 | backward_inner_microstep: 390177.78 | backward_inner: 390172.16 | backward_allreduce_microstep: 7.20 | backward_allreduce: 2.47 | reduce_tied_grads: 0.31 | comms: 17.95 | reduce_grads: 0.19 | step: 349.69 | _step_clipping: 0.12 | _step_step: 348.01 | _step_zero_grad: 0.48 | _step_check_overflow: 0.52 samples/sec: 16.347 | iteration 4570/ 143000 | elapsed time per iteration (ms): 62641.2 | learning rate: 5.993E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.860313E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 07:14:08,641] [INFO] [logging.py:60:log_dist] [Rank 0] step=4580, skipped=0, lr=[0.000599281930122275, 0.000599281930122275], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4580 loss: 2.8417 iter time (s): 62.594 samples/sec: 16.359 %comms: 0.002914522035983615 %optimizer_step 0.05564890873076182 %forward: 23.20924631116057 %backward: 62.35089235635638 [2025-03-28 07:14:08,642] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20467.68 | forward: 145275.28 | backward_microstep: 390284.91 | backward: 390277.36 | backward_inner_microstep: 390261.74 | backward_inner: 390255.85 | backward_allreduce_microstep: 7.47 | backward_allreduce: 2.51 | reduce_tied_grads: 0.29 | comms: 18.24 | reduce_grads: 0.19 | step: 348.33 | _step_clipping: 0.15 | _step_step: 346.47 | _step_zero_grad: 0.49 | _step_check_overflow: 0.56 samples/sec: 16.359 | iteration 4580/ 143000 | elapsed time per iteration (ms): 62594.3 | learning rate: 5.993E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.853626E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 07:24:26,775] [INFO] [logging.py:60:log_dist] [Rank 0] step=4590, skipped=0, lr=[0.0005992773655457928, 0.0005992773655457928], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4590 loss: 2.8554 iter time (s): 61.813 samples/sec: 16.566 %comms: 0.002893831841362565 %optimizer_step 0.055748363250855075 %forward: 23.499736455564438 %backward: 63.12408000805919 [2025-03-28 07:24:26,776] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 12765.47 | forward: 145258.51 | backward_microstep: 390194.02 | backward: 390187.78 | backward_inner_microstep: 390170.75 | backward_inner: 390164.87 | backward_allreduce_microstep: 7.29 | backward_allreduce: 2.50 | reduce_tied_grads: 0.27 | comms: 17.89 | reduce_grads: 0.18 | step: 344.60 | _step_clipping: 0.12 | _step_step: 342.94 | _step_zero_grad: 0.46 | _step_check_overflow: 0.54 samples/sec: 16.566 | iteration 4590/ 143000 | elapsed time per iteration (ms): 61813.4 | learning rate: 5.993E-04 | approx flops per GPU: 71.5TFLOPS | lm_loss: 2.850050E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 07:34:56,749] [INFO] [logging.py:60:log_dist] [Rank 0] step=4600, skipped=0, lr=[0.0005992727865248431, 0.0005992727865248431], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4600 loss: 2.8523 iter time (s): 62.997 samples/sec: 16.255 %comms: 0.0028383444011627003 %optimizer_step 0.05531832832147256 %forward: 23.075603924702186 %backward: 61.94828763880486 [2025-03-28 07:34:56,750] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24383.34 | forward: 145369.06 | backward_microstep: 390262.26 | backward: 390254.75 | backward_inner_microstep: 390239.40 | backward_inner: 390233.46 | backward_allreduce_microstep: 7.36 | backward_allreduce: 2.51 | reduce_tied_grads: 0.30 | comms: 17.88 | reduce_grads: 0.18 | step: 348.49 | _step_clipping: 0.12 | _step_step: 346.79 | _step_zero_grad: 0.47 | _step_check_overflow: 0.56 samples/sec: 16.255 | iteration 4600/ 143000 | elapsed time per iteration (ms): 62997.4 | learning rate: 5.993E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.852002E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 07:45:19,441] [INFO] [logging.py:60:log_dist] [Rank 0] step=4610, skipped=0, lr=[0.0005992681930596466, 0.0005992681930596466], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4610 loss: 2.8721 iter time (s): 62.269 samples/sec: 16.445 %comms: 0.002872916595836546 %optimizer_step 0.056104787815076745 %forward: 23.307488219603258 %backward: 62.63583839378749 [2025-03-28 07:45:19,442] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17628.43 | forward: 145132.56 | backward_microstep: 390031.36 | backward: 390024.85 | backward_inner_microstep: 390009.60 | backward_inner: 390003.71 | backward_allreduce_microstep: 7.46 | backward_allreduce: 2.67 | reduce_tied_grads: 0.29 | comms: 17.89 | reduce_grads: 0.18 | step: 349.36 | _step_clipping: 0.14 | _step_step: 347.70 | _step_zero_grad: 0.46 | _step_check_overflow: 0.54 samples/sec: 16.445 | iteration 4610/ 143000 | elapsed time per iteration (ms): 62269.2 | learning rate: 5.993E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.860464E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 07:55:41,273] [INFO] [logging.py:60:log_dist] [Rank 0] step=4620, skipped=0, lr=[0.0005992635851504253, 0.0005992635851504253], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4620 loss: 2.8270 iter time (s): 62.183 samples/sec: 16.468 %comms: 0.002883097225045741 %optimizer_step 0.05557106158352111 %forward: 23.336963266294728 %backward: 62.717738196880234 [2025-03-28 07:55:41,274] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16853.63 | forward: 145115.60 | backward_microstep: 390002.04 | backward: 389996.01 | backward_inner_microstep: 389981.29 | backward_inner: 389975.91 | backward_allreduce_microstep: 7.18 | backward_allreduce: 2.47 | reduce_tied_grads: 0.26 | comms: 17.93 | reduce_grads: 0.18 | step: 345.56 | _step_clipping: 0.11 | _step_step: 343.96 | _step_zero_grad: 0.45 | _step_check_overflow: 0.51 samples/sec: 16.467 | iteration 4620/ 143000 | elapsed time per iteration (ms): 62183.2 | learning rate: 5.993E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.848559E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 08:06:06,414] [INFO] [logging.py:60:log_dist] [Rank 0] step=4630, skipped=0, lr=[0.0005992589627974013, 0.0005992589627974013], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4630 loss: 2.8422 iter time (s): 62.514 samples/sec: 16.380 %comms: 0.0028685657954125305 %optimizer_step 0.056962316259381994 %forward: 23.27020306158655 %backward: 62.43176715284163 [2025-03-28 08:06:06,415] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19431.64 | forward: 145470.23 | backward_microstep: 390290.11 | backward: 390282.96 | backward_inner_microstep: 390266.05 | backward_inner: 390260.21 | backward_allreduce_microstep: 7.33 | backward_allreduce: 2.55 | reduce_tied_grads: 0.27 | comms: 17.93 | reduce_grads: 0.18 | step: 356.09 | _step_clipping: 0.13 | _step_step: 354.26 | _step_zero_grad: 0.48 | _step_check_overflow: 0.68 samples/sec: 16.380 | iteration 4630/ 143000 | elapsed time per iteration (ms): 62514.1 | learning rate: 5.993E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.849399E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 08:16:33,704] [INFO] [logging.py:60:log_dist] [Rank 0] step=4640, skipped=0, lr=[0.000599254326000798, 0.000599254326000798], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4640 loss: 2.8560 iter time (s): 62.728 samples/sec: 16.324 %comms: 0.00287816213863335 %optimizer_step 0.058373765157696475 %forward: 23.157873751978965 %backward: 62.22529854668456 [2025-03-28 08:16:33,704] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21718.23 | forward: 145265.61 | backward_microstep: 390337.24 | backward: 390329.26 | backward_inner_microstep: 390311.86 | backward_inner: 390302.14 | backward_allreduce_microstep: 9.22 | backward_allreduce: 2.58 | reduce_tied_grads: 0.33 | comms: 18.05 | reduce_grads: 0.20 | step: 366.17 | _step_clipping: 0.14 | _step_step: 364.38 | _step_zero_grad: 0.48 | _step_check_overflow: 0.60 samples/sec: 16.324 | iteration 4640/ 143000 | elapsed time per iteration (ms): 62729.0 | learning rate: 5.993E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.852278E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 08:26:55,992] [INFO] [logging.py:60:log_dist] [Rank 0] step=4650, skipped=0, lr=[0.0005992496747608389, 0.0005992496747608389], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4650 loss: 2.8536 iter time (s): 62.228 samples/sec: 16.456 %comms: 0.002885391750409971 %optimizer_step 0.05637253195036794 %forward: 23.34336013288333 %backward: 62.70010671187105 [2025-03-28 08:26:55,993] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16912.28 | forward: 145261.77 | backward_microstep: 390178.80 | backward: 390172.12 | backward_inner_microstep: 390156.96 | backward_inner: 390149.14 | backward_allreduce_microstep: 7.32 | backward_allreduce: 2.49 | reduce_tied_grads: 0.30 | comms: 17.96 | reduce_grads: 0.18 | step: 350.80 | _step_clipping: 0.11 | _step_step: 349.10 | _step_zero_grad: 0.47 | _step_check_overflow: 0.57 samples/sec: 16.455 | iteration 4650/ 143000 | elapsed time per iteration (ms): 62228.9 | learning rate: 5.992E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.850155E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 08:37:17,745] [INFO] [logging.py:60:log_dist] [Rank 0] step=4660, skipped=0, lr=[0.0005992450090777486, 0.0005992450090777486], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4660 loss: 2.8535 iter time (s): 62.175 samples/sec: 16.470 %comms: 0.0028759172149969863 %optimizer_step 0.05584102630611079 %forward: 23.36380646619596 %backward: 62.75715027026641 [2025-03-28 08:37:17,745] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16345.58 | forward: 145263.66 | backward_microstep: 390196.95 | backward: 390190.43 | backward_inner_microstep: 390175.65 | backward_inner: 390170.08 | backward_allreduce_microstep: 7.17 | backward_allreduce: 2.47 | reduce_tied_grads: 0.27 | comms: 17.88 | reduce_grads: 0.18 | step: 347.19 | _step_clipping: 0.13 | _step_step: 345.52 | _step_zero_grad: 0.46 | _step_check_overflow: 0.55 samples/sec: 16.470 | iteration 4660/ 143000 | elapsed time per iteration (ms): 62175.2 | learning rate: 5.992E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.844984E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 08:47:45,110] [INFO] [logging.py:60:log_dist] [Rank 0] step=4670, skipped=0, lr=[0.0005992403289517524, 0.0005992403289517524], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4670 loss: 2.8292 iter time (s): 62.736 samples/sec: 16.322 %comms: 0.002946528215819738 %optimizer_step 0.057910815414442166 %forward: 23.16420774675631 %backward: 62.2090410366834 [2025-03-28 08:47:45,110] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21809.54 | forward: 145322.72 | backward_microstep: 390280.96 | backward: 390273.97 | backward_inner_microstep: 390256.92 | backward_inner: 390249.26 | backward_allreduce_microstep: 9.11 | backward_allreduce: 4.30 | reduce_tied_grads: 0.31 | comms: 18.49 | reduce_grads: 0.20 | step: 363.31 | _step_clipping: 0.14 | _step_step: 361.48 | _step_zero_grad: 0.49 | _step_check_overflow: 0.58 samples/sec: 16.322 | iteration 4670/ 143000 | elapsed time per iteration (ms): 62736.5 | learning rate: 5.992E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.841296E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 08:58:08,591] [INFO] [logging.py:60:log_dist] [Rank 0] step=4680, skipped=0, lr=[0.000599235634383076, 0.000599235634383076], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4680 loss: 2.8576 iter time (s): 62.348 samples/sec: 16.424 %comms: 0.002867443125625999 %optimizer_step 0.055474938622032864 %forward: 23.30907970640203 %backward: 62.602818036451524 [2025-03-28 08:58:08,592] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17915.70 | forward: 145326.50 | backward_microstep: 390322.30 | backward: 390313.49 | backward_inner_microstep: 390298.18 | backward_inner: 390292.08 | backward_allreduce_microstep: 7.37 | backward_allreduce: 2.54 | reduce_tied_grads: 0.27 | comms: 17.88 | reduce_grads: 0.18 | step: 345.87 | _step_clipping: 0.12 | _step_step: 344.30 | _step_zero_grad: 0.46 | _step_check_overflow: 0.47 samples/sec: 16.424 | iteration 4680/ 143000 | elapsed time per iteration (ms): 62348.2 | learning rate: 5.992E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.846906E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 09:08:31,650] [INFO] [logging.py:60:log_dist] [Rank 0] step=4690, skipped=0, lr=[0.0005992309253719461, 0.0005992309253719461], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4690 loss: 2.8703 iter time (s): 62.305 samples/sec: 16.435 %comms: 0.002884387552008552 %optimizer_step 0.05682144368129603 %forward: 23.320657664928728 %backward: 62.64535276833312 [2025-03-28 09:08:31,651] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17532.17 | forward: 145300.17 | backward_microstep: 390320.80 | backward: 390314.06 | backward_inner_microstep: 390298.78 | backward_inner: 390292.92 | backward_allreduce_microstep: 7.30 | backward_allreduce: 2.51 | reduce_tied_grads: 0.31 | comms: 17.97 | reduce_grads: 0.19 | step: 354.03 | _step_clipping: 0.13 | _step_step: 352.26 | _step_zero_grad: 0.47 | _step_check_overflow: 0.62 samples/sec: 16.435 | iteration 4690/ 143000 | elapsed time per iteration (ms): 62305.9 | learning rate: 5.992E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.851234E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 09:18:59,486] [INFO] [logging.py:60:log_dist] [Rank 0] step=4700, skipped=0, lr=[0.00059922620191859, 0.00059922620191859], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4700 loss: 2.8391 iter time (s): 62.783 samples/sec: 16.310 %comms: 0.0028580371425617384 %optimizer_step 0.056115822507169066 %forward: 23.17421650438286 %backward: 62.18787000641346 [2025-03-28 09:18:59,486] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22000.70 | forward: 145494.73 | backward_microstep: 390442.44 | backward: 390434.24 | backward_inner_microstep: 390418.49 | backward_inner: 390412.36 | backward_allreduce_microstep: 7.46 | backward_allreduce: 2.58 | reduce_tied_grads: 0.30 | comms: 17.94 | reduce_grads: 0.20 | step: 352.31 | _step_clipping: 0.13 | _step_step: 350.60 | _step_zero_grad: 0.46 | _step_check_overflow: 0.56 samples/sec: 16.310 | iteration 4700/ 143000 | elapsed time per iteration (ms): 62783.6 | learning rate: 5.992E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.844518E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 09:29:30,010] [INFO] [logging.py:60:log_dist] [Rank 0] step=4710, skipped=0, lr=[0.0005992214640232356, 0.0005992214640232356], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4710 loss: 2.8445 iter time (s): 63.052 samples/sec: 16.241 %comms: 0.002859197157819327 %optimizer_step 0.05612884518121484 %forward: 23.0804687673819 %backward: 61.91515059083519 [2025-03-28 09:29:30,011] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24702.45 | forward: 145526.75 | backward_microstep: 390394.11 | backward: 390386.81 | backward_inner_microstep: 390371.26 | backward_inner: 390363.39 | backward_allreduce_microstep: 7.48 | backward_allreduce: 2.58 | reduce_tied_grads: 0.30 | comms: 18.03 | reduce_grads: 0.20 | step: 353.90 | _step_clipping: 0.14 | _step_step: 352.21 | _step_zero_grad: 0.48 | _step_check_overflow: 0.51 samples/sec: 16.240 | iteration 4710/ 143000 | elapsed time per iteration (ms): 63052.4 | learning rate: 5.992E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.842460E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 09:39:55,209] [INFO] [logging.py:60:log_dist] [Rank 0] step=4720, skipped=0, lr=[0.0005992167116861115, 0.0005992167116861115], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4720 loss: 2.8457 iter time (s): 62.519 samples/sec: 16.379 %comms: 0.0028920973180321355 %optimizer_step 0.055528932058701685 %forward: 23.249491018443777 %backward: 62.41391446648762 [2025-03-28 09:39:55,209] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19746.64 | forward: 145354.18 | backward_microstep: 390216.59 | backward: 390207.39 | backward_inner_microstep: 390191.81 | backward_inner: 390185.89 | backward_allreduce_microstep: 7.49 | backward_allreduce: 2.56 | reduce_tied_grads: 0.30 | comms: 18.08 | reduce_grads: 0.19 | step: 347.16 | _step_clipping: 0.13 | _step_step: 345.40 | _step_zero_grad: 0.49 | _step_check_overflow: 0.57 samples/sec: 16.379 | iteration 4720/ 143000 | elapsed time per iteration (ms): 62519.8 | learning rate: 5.992E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.844264E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 09:50:19,637] [INFO] [logging.py:60:log_dist] [Rank 0] step=4730, skipped=0, lr=[0.0005992119449074473, 0.0005992119449074473], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4730 loss: 2.8468 iter time (s): 62.442 samples/sec: 16.399 %comms: 0.002868210275054588 %optimizer_step 0.057920482281896495 %forward: 23.282602693124783 %backward: 62.49742465690717 [2025-03-28 09:50:19,638] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18881.64 | forward: 145381.93 | backward_microstep: 390254.93 | backward: 390248.31 | backward_inner_microstep: 390231.33 | backward_inner: 390225.54 | backward_allreduce_microstep: 7.38 | backward_allreduce: 2.56 | reduce_tied_grads: 0.28 | comms: 17.91 | reduce_grads: 0.19 | step: 361.67 | _step_clipping: 0.12 | _step_step: 359.91 | _step_zero_grad: 0.47 | _step_check_overflow: 0.62 samples/sec: 16.399 | iteration 4730/ 143000 | elapsed time per iteration (ms): 62442.8 | learning rate: 5.992E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.847712E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 10:00:41,995] [INFO] [logging.py:60:log_dist] [Rank 0] step=4740, skipped=0, lr=[0.0005992071636874729, 0.0005992071636874729], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4740 loss: 2.8330 iter time (s): 62.235 samples/sec: 16.454 %comms: 0.0028944177352547866 %optimizer_step 0.05620087016661679 %forward: 23.377502521871687 %backward: 62.75300202576142 [2025-03-28 10:00:41,996] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16358.87 | forward: 145490.44 | backward_microstep: 390559.67 | backward: 390544.79 | backward_inner_microstep: 390528.56 | backward_inner: 390522.29 | backward_allreduce_microstep: 7.69 | backward_allreduce: 2.67 | reduce_tied_grads: 0.31 | comms: 18.01 | reduce_grads: 0.19 | step: 349.77 | _step_clipping: 0.13 | _step_step: 348.00 | _step_zero_grad: 0.49 | _step_check_overflow: 0.59 samples/sec: 16.454 | iteration 4740/ 143000 | elapsed time per iteration (ms): 62235.8 | learning rate: 5.992E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.839906E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 10:11:08,564] [INFO] [logging.py:60:log_dist] [Rank 0] step=4750, skipped=0, lr=[0.000599202368026419, 0.000599202368026419], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4750 loss: 2.8460 iter time (s): 62.656 samples/sec: 16.343 %comms: 0.002852588377624584 %optimizer_step 0.055119537138338615 %forward: 23.200065049943476 %backward: 62.291272706391446 [2025-03-28 10:11:08,565] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21024.13 | forward: 145363.22 | backward_microstep: 390302.80 | backward: 390294.59 | backward_inner_microstep: 390279.17 | backward_inner: 390273.27 | backward_allreduce_microstep: 7.47 | backward_allreduce: 2.49 | reduce_tied_grads: 0.45 | comms: 17.87 | reduce_grads: 0.18 | step: 345.36 | _step_clipping: 0.12 | _step_step: 343.67 | _step_zero_grad: 0.46 | _step_check_overflow: 0.58 samples/sec: 16.343 | iteration 4750/ 143000 | elapsed time per iteration (ms): 62656.9 | learning rate: 5.992E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.840522E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 10:21:36,601] [INFO] [logging.py:60:log_dist] [Rank 0] step=4760, skipped=0, lr=[0.0005991975579245172, 0.0005991975579245172], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4760 loss: 2.8425 iter time (s): 62.803 samples/sec: 16.305 %comms: 0.0028471017439272435 %optimizer_step 0.05585940960642309 %forward: 23.143009577033446 %backward: 62.14197304552501 [2025-03-28 10:21:36,601] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22570.69 | forward: 145345.25 | backward_microstep: 390278.20 | backward: 390270.78 | backward_inner_microstep: 390255.14 | backward_inner: 390249.31 | backward_allreduce_microstep: 7.32 | backward_allreduce: 2.49 | reduce_tied_grads: 0.28 | comms: 17.88 | reduce_grads: 0.18 | step: 350.81 | _step_clipping: 0.13 | _step_step: 349.15 | _step_zero_grad: 0.48 | _step_check_overflow: 0.50 samples/sec: 16.305 | iteration 4760/ 143000 | elapsed time per iteration (ms): 62803.6 | learning rate: 5.992E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.840127E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 10:32:05,185] [INFO] [logging.py:60:log_dist] [Rank 0] step=4770, skipped=0, lr=[0.0005991927333819997, 0.0005991927333819997], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4770 loss: 2.8534 iter time (s): 62.858 samples/sec: 16.291 %comms: 0.0028658597808874677 %optimizer_step 0.057078503562729636 %forward: 23.156716469237352 %backward: 62.09740099251266 [2025-03-28 10:32:05,186] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22798.91 | forward: 145558.26 | backward_microstep: 390338.28 | backward: 390331.22 | backward_inner_microstep: 390315.82 | backward_inner: 390310.03 | backward_allreduce_microstep: 7.49 | backward_allreduce: 2.69 | reduce_tied_grads: 0.33 | comms: 18.01 | reduce_grads: 0.21 | step: 358.78 | _step_clipping: 0.14 | _step_step: 356.86 | _step_zero_grad: 0.53 | _step_check_overflow: 0.67 samples/sec: 16.291 | iteration 4770/ 143000 | elapsed time per iteration (ms): 62858.5 | learning rate: 5.992E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.840742E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 10:42:32,579] [INFO] [logging.py:60:log_dist] [Rank 0] step=4780, skipped=0, lr=[0.0005991878943990992, 0.0005991878943990992], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4780 loss: 2.8327 iter time (s): 62.739 samples/sec: 16.322 %comms: 0.0028600121591026632 %optimizer_step 0.05579699033874973 %forward: 23.16393189461482 %backward: 62.218712293897624 [2025-03-28 10:42:32,580] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21831.69 | forward: 145327.80 | backward_microstep: 390360.37 | backward: 390352.93 | backward_inner_microstep: 390336.12 | backward_inner: 390330.26 | backward_allreduce_microstep: 7.22 | backward_allreduce: 2.49 | reduce_tied_grads: 0.31 | comms: 17.94 | reduce_grads: 0.18 | step: 350.06 | _step_clipping: 0.13 | _step_step: 348.35 | _step_zero_grad: 0.47 | _step_check_overflow: 0.56 samples/sec: 16.321 | iteration 4780/ 143000 | elapsed time per iteration (ms): 62739.4 | learning rate: 5.992E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.837944E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 10:52:59,977] [INFO] [logging.py:60:log_dist] [Rank 0] step=4790, skipped=0, lr=[0.0005991830409760493, 0.0005991830409760493], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4790 loss: 2.8156 iter time (s): 62.739 samples/sec: 16.322 %comms: 0.0028522789479074684 %optimizer_step 0.055327188911312594 %forward: 23.20287339075992 %backward: 62.20713274344679 [2025-03-28 10:52:59,978] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21659.51 | forward: 145573.08 | backward_microstep: 390291.21 | backward: 390282.87 | backward_inner_microstep: 390267.28 | backward_inner: 390261.39 | backward_allreduce_microstep: 7.53 | backward_allreduce: 2.72 | reduce_tied_grads: 0.27 | comms: 17.89 | reduce_grads: 0.18 | step: 347.12 | _step_clipping: 0.14 | _step_step: 345.40 | _step_zero_grad: 0.47 | _step_check_overflow: 0.56 samples/sec: 16.321 | iteration 4790/ 143000 | elapsed time per iteration (ms): 62739.8 | learning rate: 5.992E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.826653E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 11:03:27,723] [INFO] [logging.py:60:log_dist] [Rank 0] step=4800, skipped=0, lr=[0.0005991781731130843, 0.0005991781731130843], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4800 loss: 2.8594 iter time (s): 62.774 samples/sec: 16.312 %comms: 0.0028727264372957334 %optimizer_step 0.05663981035668148 %forward: 23.191258007496586 %backward: 62.20024399317429 [2025-03-28 11:03:27,724] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21817.07 | forward: 145580.91 | backward_microstep: 390463.92 | backward: 390456.10 | backward_inner_microstep: 390440.47 | backward_inner: 390434.51 | backward_allreduce_microstep: 7.47 | backward_allreduce: 2.58 | reduce_tied_grads: 0.31 | comms: 18.03 | reduce_grads: 0.20 | step: 355.55 | _step_clipping: 0.14 | _step_step: 353.76 | _step_zero_grad: 0.49 | _step_check_overflow: 0.56 samples/sec: 16.312 | iteration 4800/ 143000 | elapsed time per iteration (ms): 62774.6 | learning rate: 5.992E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.840950E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 11:13:54,547] [INFO] [logging.py:60:log_dist] [Rank 0] step=4810, skipped=0, lr=[0.000599173290810439, 0.000599173290810439], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4810 loss: 2.8160 iter time (s): 62.682 samples/sec: 16.336 %comms: 0.0028808726943136127 %optimizer_step 0.05551955933313168 %forward: 23.189446757021408 %backward: 62.264718724821066 [2025-03-28 11:13:54,547] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21301.37 | forward: 145355.58 | backward_microstep: 390294.07 | backward: 390286.35 | backward_inner_microstep: 390271.03 | backward_inner: 390265.10 | backward_allreduce_microstep: 7.29 | backward_allreduce: 2.52 | reduce_tied_grads: 0.31 | comms: 18.06 | reduce_grads: 0.19 | step: 348.01 | _step_clipping: 0.15 | _step_step: 346.29 | _step_zero_grad: 0.48 | _step_check_overflow: 0.53 samples/sec: 16.336 | iteration 4810/ 143000 | elapsed time per iteration (ms): 62682.3 | learning rate: 5.992E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.826270E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 11:24:24,527] [INFO] [logging.py:60:log_dist] [Rank 0] step=4820, skipped=0, lr=[0.0005991683940683493, 0.0005991683940683493], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4820 loss: 2.8501 iter time (s): 62.997 samples/sec: 16.255 %comms: 0.002865494432726938 %optimizer_step 0.056067827480561806 %forward: 23.113576372123635 %backward: 61.9848421706387 [2025-03-28 11:24:24,527] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23961.68 | forward: 145609.46 | backward_microstep: 390497.69 | backward: 390488.23 | backward_inner_microstep: 390471.80 | backward_inner: 390465.37 | backward_allreduce_microstep: 7.85 | backward_allreduce: 2.71 | reduce_tied_grads: 0.31 | comms: 18.05 | reduce_grads: 0.18 | step: 353.21 | _step_clipping: 0.15 | _step_step: 351.43 | _step_zero_grad: 0.50 | _step_check_overflow: 0.55 samples/sec: 16.254 | iteration 4820/ 143000 | elapsed time per iteration (ms): 62998.0 | learning rate: 5.992E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.832202E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 11:34:54,491] [INFO] [logging.py:60:log_dist] [Rank 0] step=4830, skipped=0, lr=[0.0005991634828870513, 0.0005991634828870513], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4830 loss: 2.8326 iter time (s): 62.996 samples/sec: 16.255 %comms: 0.0028456556662034707 %optimizer_step 0.05557462686606423 %forward: 23.087091411448966 %backward: 61.93382239025665 [2025-03-28 11:34:54,492] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24525.86 | forward: 145439.13 | backward_microstep: 390164.54 | backward: 390157.48 | backward_inner_microstep: 390142.00 | backward_inner: 390136.23 | backward_allreduce_microstep: 7.50 | backward_allreduce: 2.58 | reduce_tied_grads: 0.28 | comms: 17.93 | reduce_grads: 0.18 | step: 350.10 | _step_clipping: 0.12 | _step_step: 348.36 | _step_zero_grad: 0.50 | _step_check_overflow: 0.53 samples/sec: 16.255 | iteration 4830/ 143000 | elapsed time per iteration (ms): 62996.4 | learning rate: 5.992E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.838758E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 11:45:15,300] [INFO] [logging.py:60:log_dist] [Rank 0] step=4840, skipped=0, lr=[0.0005991585572667822, 0.0005991585572667822], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4840 loss: 2.8068 iter time (s): 62.080 samples/sec: 16.495 %comms: 0.0036878997429398274 %optimizer_step 0.05795421900702096 %forward: 23.430993765470816 %backward: 62.89374304105845 [2025-03-28 11:45:15,301] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15001.32 | forward: 145460.49 | backward_microstep: 390455.85 | backward: 390446.73 | backward_inner_microstep: 390429.00 | backward_inner: 390422.76 | backward_allreduce_microstep: 7.46 | backward_allreduce: 2.56 | reduce_tied_grads: 0.33 | comms: 22.89 | reduce_grads: 0.19 | step: 359.78 | _step_clipping: 0.14 | _step_step: 357.95 | _step_zero_grad: 0.50 | _step_check_overflow: 0.61 samples/sec: 16.495 | iteration 4840/ 143000 | elapsed time per iteration (ms): 62081.0 | learning rate: 5.992E-04 | approx flops per GPU: 71.2TFLOPS | lm_loss: 2.827112E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 11:55:45,380] [INFO] [logging.py:60:log_dist] [Rank 0] step=4850, skipped=0, lr=[0.0005991536172077796, 0.0005991536172077796], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4850 loss: 2.8397 iter time (s): 63.007 samples/sec: 16.252 %comms: 0.002845549823365295 %optimizer_step 0.057089146049332545 %forward: 23.071338472909925 %backward: 61.95600036648865 [2025-03-28 11:55:45,381] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24438.84 | forward: 145366.57 | backward_microstep: 390377.05 | backward: 390368.81 | backward_inner_microstep: 390352.80 | backward_inner: 390346.58 | backward_allreduce_microstep: 7.60 | backward_allreduce: 2.61 | reduce_tied_grads: 0.28 | comms: 17.93 | reduce_grads: 0.18 | step: 359.70 | _step_clipping: 0.13 | _step_step: 357.91 | _step_zero_grad: 0.48 | _step_check_overflow: 0.63 samples/sec: 16.252 | iteration 4850/ 143000 | elapsed time per iteration (ms): 63008.0 | learning rate: 5.992E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.831249E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 12:06:12,147] [INFO] [logging.py:60:log_dist] [Rank 0] step=4860, skipped=0, lr=[0.000599148662710282, 0.000599148662710282], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4860 loss: 2.8343 iter time (s): 62.676 samples/sec: 16.338 %comms: 0.0029466754676008558 %optimizer_step 0.05681043315825055 %forward: 23.230155705301854 %backward: 62.296098534806966 [2025-03-28 12:06:12,148] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20774.76 | forward: 145597.60 | backward_microstep: 390457.01 | backward: 390447.77 | backward_inner_microstep: 390428.14 | backward_inner: 390421.76 | backward_allreduce_microstep: 9.40 | backward_allreduce: 2.61 | reduce_tied_grads: 0.31 | comms: 18.47 | reduce_grads: 0.18 | step: 356.07 | _step_clipping: 0.12 | _step_step: 354.34 | _step_zero_grad: 0.51 | _step_check_overflow: 0.52 samples/sec: 16.338 | iteration 4860/ 143000 | elapsed time per iteration (ms): 62676.7 | learning rate: 5.991E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.840263E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 12:16:37,131] [INFO] [logging.py:60:log_dist] [Rank 0] step=4870, skipped=0, lr=[0.0005991436937745285, 0.0005991436937745285], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4870 loss: 2.8429 iter time (s): 62.498 samples/sec: 16.385 %comms: 0.002956682714470763 %optimizer_step 0.057156848385523124 %forward: 23.29680765271627 %backward: 62.4783905757693 [2025-03-28 12:16:37,132] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18941.96 | forward: 145600.04 | backward_microstep: 390489.37 | backward: 390476.52 | backward_inner_microstep: 390459.54 | backward_inner: 390450.79 | backward_allreduce_microstep: 8.05 | backward_allreduce: 2.77 | reduce_tied_grads: 0.33 | comms: 18.48 | reduce_grads: 0.21 | step: 357.22 | _step_clipping: 0.13 | _step_step: 355.04 | _step_zero_grad: 0.57 | _step_check_overflow: 0.82 samples/sec: 16.384 | iteration 4870/ 143000 | elapsed time per iteration (ms): 62498.5 | learning rate: 5.991E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.834814E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 12:27:04,612] [INFO] [logging.py:60:log_dist] [Rank 0] step=4880, skipped=0, lr=[0.000599138710400759, 0.000599138710400759], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4880 loss: 2.8440 iter time (s): 62.747 samples/sec: 16.319 %comms: 0.0029237228265863637 %optimizer_step 0.0575640964760762 %forward: 23.17337689204501 %backward: 62.2044564847939 [2025-03-28 12:27:04,613] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21847.66 | forward: 145406.87 | backward_microstep: 390325.41 | backward: 390316.66 | backward_inner_microstep: 390298.92 | backward_inner: 390292.26 | backward_allreduce_microstep: 9.27 | backward_allreduce: 2.67 | reduce_tied_grads: 0.31 | comms: 18.35 | reduce_grads: 0.18 | step: 361.20 | _step_clipping: 0.14 | _step_step: 359.29 | _step_zero_grad: 0.51 | _step_check_overflow: 0.66 samples/sec: 16.319 | iteration 4880/ 143000 | elapsed time per iteration (ms): 62748.0 | learning rate: 5.991E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.846220E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 12:37:33,114] [INFO] [logging.py:60:log_dist] [Rank 0] step=4890, skipped=0, lr=[0.0005991337125892139, 0.0005991337125892139], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4890 loss: 2.8289 iter time (s): 62.850 samples/sec: 16.293 %comms: 0.0028839508408834975 %optimizer_step 0.05640008491919649 %forward: 23.12065320014678 %backward: 62.08402738105414 [2025-03-28 12:37:33,115] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23111.53 | forward: 145312.53 | backward_microstep: 390203.33 | backward: 390196.02 | backward_inner_microstep: 390180.64 | backward_inner: 390174.72 | backward_allreduce_microstep: 7.42 | backward_allreduce: 2.57 | reduce_tied_grads: 0.31 | comms: 18.13 | reduce_grads: 0.20 | step: 354.47 | _step_clipping: 0.14 | _step_step: 352.64 | _step_zero_grad: 0.48 | _step_check_overflow: 0.63 samples/sec: 16.293 | iteration 4890/ 143000 | elapsed time per iteration (ms): 62850.2 | learning rate: 5.991E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.837652E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 12:48:00,630] [INFO] [logging.py:60:log_dist] [Rank 0] step=4900, skipped=0, lr=[0.0005991287003401345, 0.0005991287003401345], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4900 loss: 2.8362 iter time (s): 62.751 samples/sec: 16.318 %comms: 0.002862836771746486 %optimizer_step 0.05585523760484023 %forward: 23.155000675908205 %backward: 62.186370635086455 [2025-03-28 12:48:00,631] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22119.71 | forward: 145300.06 | backward_microstep: 390233.92 | backward: 390226.01 | backward_inner_microstep: 390210.58 | backward_inner: 390204.64 | backward_allreduce_microstep: 7.34 | backward_allreduce: 2.52 | reduce_tied_grads: 0.32 | comms: 17.96 | reduce_grads: 0.23 | step: 350.50 | _step_clipping: 0.14 | _step_step: 348.83 | _step_zero_grad: 0.46 | _step_check_overflow: 0.52 samples/sec: 16.318 | iteration 4900/ 143000 | elapsed time per iteration (ms): 62751.6 | learning rate: 5.991E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.826513E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 12:58:27,224] [INFO] [logging.py:60:log_dist] [Rank 0] step=4910, skipped=0, lr=[0.0005991236736537627, 0.0005991236736537627], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4910 loss: 2.8340 iter time (s): 62.659 samples/sec: 16.342 %comms: 0.002852286256629796 %optimizer_step 0.056793109392974446 %forward: 23.18900720863308 %backward: 62.27181832183453 [2025-03-28 12:58:27,224] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21225.12 | forward: 145299.63 | backward_microstep: 390196.82 | backward: 390188.00 | backward_inner_microstep: 390172.76 | backward_inner: 390166.90 | backward_allreduce_microstep: 7.30 | backward_allreduce: 2.50 | reduce_tied_grads: 0.27 | comms: 17.87 | reduce_grads: 0.18 | step: 355.86 | _step_clipping: 0.11 | _step_step: 354.20 | _step_zero_grad: 0.46 | _step_check_overflow: 0.54 samples/sec: 16.342 | iteration 4910/ 143000 | elapsed time per iteration (ms): 62659.4 | learning rate: 5.991E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.823113E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 13:08:51,864] [INFO] [logging.py:60:log_dist] [Rank 0] step=4920, skipped=0, lr=[0.000599118632530341, 0.000599118632530341], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4920 loss: 2.8014 iter time (s): 62.463 samples/sec: 16.394 %comms: 0.002903691097815556 %optimizer_step 0.056095927156541096 %forward: 23.30030733131634 %backward: 62.48256385482345 [2025-03-28 13:08:51,864] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18950.08 | forward: 145541.75 | backward_microstep: 390295.11 | backward: 390287.62 | backward_inner_microstep: 390272.04 | backward_inner: 390266.05 | backward_allreduce_microstep: 7.46 | backward_allreduce: 2.56 | reduce_tied_grads: 0.29 | comms: 18.14 | reduce_grads: 0.18 | step: 350.39 | _step_clipping: 0.12 | _step_step: 348.66 | _step_zero_grad: 0.47 | _step_check_overflow: 0.59 samples/sec: 16.393 | iteration 4920/ 143000 | elapsed time per iteration (ms): 62464.0 | learning rate: 5.991E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.819086E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 13:19:20,457] [INFO] [logging.py:60:log_dist] [Rank 0] step=4930, skipped=0, lr=[0.0005991135769701129, 0.0005991135769701129], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4930 loss: 2.8194 iter time (s): 62.859 samples/sec: 16.290 %comms: 0.0028441248038330487 %optimizer_step 0.0552022303060428 %forward: 23.16951579547644 %backward: 62.102370804253006 [2025-03-28 13:19:20,457] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22754.68 | forward: 145640.71 | backward_microstep: 390376.75 | backward: 390367.83 | backward_inner_microstep: 390352.07 | backward_inner: 390346.06 | backward_allreduce_microstep: 7.50 | backward_allreduce: 2.58 | reduce_tied_grads: 0.28 | comms: 17.88 | reduce_grads: 0.18 | step: 346.99 | _step_clipping: 0.13 | _step_step: 345.33 | _step_zero_grad: 0.48 | _step_check_overflow: 0.53 samples/sec: 16.290 | iteration 4930/ 143000 | elapsed time per iteration (ms): 62859.3 | learning rate: 5.991E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.822591E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 13:29:46,927] [INFO] [logging.py:60:log_dist] [Rank 0] step=4940, skipped=0, lr=[0.0005991085069733223, 0.0005991085069733223], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4940 loss: 2.8201 iter time (s): 62.646 samples/sec: 16.346 %comms: 0.002862287726674447 %optimizer_step 0.055659962872981944 %forward: 23.203332957146113 %backward: 62.30202041621209 [2025-03-28 13:29:46,927] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20975.78 | forward: 145360.70 | backward_microstep: 390307.28 | backward: 390300.18 | backward_inner_microstep: 390281.04 | backward_inner: 390275.18 | backward_allreduce_microstep: 9.20 | backward_allreduce: 2.55 | reduce_tied_grads: 0.31 | comms: 17.93 | reduce_grads: 0.18 | step: 348.69 | _step_clipping: 0.13 | _step_step: 346.82 | _step_zero_grad: 0.65 | _step_check_overflow: 0.55 samples/sec: 16.346 | iteration 4940/ 143000 | elapsed time per iteration (ms): 62647.0 | learning rate: 5.991E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.817256E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 13:40:16,595] [INFO] [logging.py:60:log_dist] [Rank 0] step=4950, skipped=0, lr=[0.0005991034225402138, 0.0005991034225402138], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4950 loss: 2.8032 iter time (s): 62.966 samples/sec: 16.263 %comms: 0.0028753163631321244 %optimizer_step 0.05927698476938028 %forward: 23.08842867640427 %backward: 61.984156408958725 [2025-03-28 13:40:16,596] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24145.75 | forward: 145379.20 | backward_microstep: 390298.06 | backward: 390291.05 | backward_inner_microstep: 390275.48 | backward_inner: 390269.56 | backward_allreduce_microstep: 7.58 | backward_allreduce: 2.59 | reduce_tied_grads: 0.32 | comms: 18.10 | reduce_grads: 0.19 | step: 373.25 | _step_clipping: 0.13 | _step_step: 371.44 | _step_zero_grad: 0.48 | _step_check_overflow: 0.61 samples/sec: 16.263 | iteration 4950/ 143000 | elapsed time per iteration (ms): 62966.8 | learning rate: 5.991E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.819091E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 13:50:41,381] [INFO] [logging.py:60:log_dist] [Rank 0] step=4960, skipped=0, lr=[0.0005990983236710331, 0.0005990983236710331], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4960 loss: 2.8101 iter time (s): 62.478 samples/sec: 16.390 %comms: 0.002868245842189484 %optimizer_step 0.05578702483138074 %forward: 23.285304379959886 %backward: 62.46923143012094 [2025-03-28 13:50:41,382] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19169.58 | forward: 145482.16 | backward_microstep: 390304.25 | backward: 390295.90 | backward_inner_microstep: 390280.08 | backward_inner: 390273.96 | backward_allreduce_microstep: 7.56 | backward_allreduce: 2.59 | reduce_tied_grads: 0.30 | comms: 17.92 | reduce_grads: 0.19 | step: 348.55 | _step_clipping: 0.14 | _step_step: 346.84 | _step_zero_grad: 0.47 | _step_check_overflow: 0.55 samples/sec: 16.390 | iteration 4960/ 143000 | elapsed time per iteration (ms): 62478.6 | learning rate: 5.991E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.809852E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 14:01:15,078] [INFO] [logging.py:60:log_dist] [Rank 0] step=4970, skipped=0, lr=[0.0005990932103660259, 0.0005990932103660259], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4970 loss: 2.8060 iter time (s): 63.369 samples/sec: 16.159 %comms: 0.002844058284196312 %optimizer_step 0.055536001965082044 %forward: 22.95560108225974 %backward: 61.593204925621805 [2025-03-28 14:01:15,078] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28099.98 | forward: 145467.54 | backward_microstep: 390319.46 | backward: 390310.51 | backward_inner_microstep: 390294.89 | backward_inner: 390288.94 | backward_allreduce_microstep: 7.43 | backward_allreduce: 2.57 | reduce_tied_grads: 0.30 | comms: 18.02 | reduce_grads: 0.19 | step: 351.93 | _step_clipping: 0.14 | _step_step: 350.20 | _step_zero_grad: 0.47 | _step_check_overflow: 0.56 samples/sec: 16.159 | iteration 4970/ 143000 | elapsed time per iteration (ms): 63369.6 | learning rate: 5.991E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.820964E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 14:11:45,466] [INFO] [logging.py:60:log_dist] [Rank 0] step=4980, skipped=0, lr=[0.0005990880826254394, 0.0005990880826254394], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4980 loss: 2.8173 iter time (s): 63.038 samples/sec: 16.244 %comms: 0.0029243400893288543 %optimizer_step 0.05893899281228862 %forward: 23.094452749579965 %backward: 61.94494593096951 [2025-03-28 14:11:45,466] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24409.28 | forward: 145583.36 | backward_microstep: 390501.79 | backward: 390490.02 | backward_inner_microstep: 390473.86 | backward_inner: 390467.46 | backward_allreduce_microstep: 7.54 | backward_allreduce: 2.59 | reduce_tied_grads: 0.33 | comms: 18.43 | reduce_grads: 0.18 | step: 371.54 | _step_clipping: 0.13 | _step_step: 369.55 | _step_zero_grad: 0.54 | _step_check_overflow: 0.72 samples/sec: 16.244 | iteration 4980/ 143000 | elapsed time per iteration (ms): 63038.8 | learning rate: 5.991E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.820308E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 14:22:13,459] [INFO] [logging.py:60:log_dist] [Rank 0] step=4990, skipped=0, lr=[0.0005990829404495207, 0.0005990829404495207], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 4990 loss: 2.8271 iter time (s): 62.799 samples/sec: 16.306 %comms: 0.0029400132656979454 %optimizer_step 0.05723338961194029 %forward: 23.212776845179448 %backward: 62.20472626318345 [2025-03-28 14:22:13,460] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21661.35 | forward: 145773.18 | backward_microstep: 390650.96 | backward: 390637.49 | backward_inner_microstep: 390619.59 | backward_inner: 390612.18 | backward_allreduce_microstep: 8.46 | backward_allreduce: 2.96 | reduce_tied_grads: 0.34 | comms: 18.46 | reduce_grads: 0.20 | step: 359.42 | _step_clipping: 0.14 | _step_step: 357.35 | _step_zero_grad: 0.58 | _step_check_overflow: 0.66 samples/sec: 16.306 | iteration 4990/ 143000 | elapsed time per iteration (ms): 62799.3 | learning rate: 5.991E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.822180E+00 | loss scale: 65536.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 14:32:40,610] [INFO] [logging.py:60:log_dist] [Rank 0] step=5000, skipped=0, lr=[0.0005990777838385182, 0.0005990777838385182], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5000 loss: 2.8120 iter time (s): 62.715 samples/sec: 16.328 %comms: 0.0028933963201737776 %optimizer_step 0.05563175899950307 %forward: 23.16986708459086 %backward: 62.220646955281666 [2025-03-28 14:32:40,611] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21811.25 | forward: 145308.74 | backward_microstep: 390220.50 | backward: 390213.88 | backward_inner_microstep: 390198.92 | backward_inner: 390193.21 | backward_allreduce_microstep: 7.18 | backward_allreduce: 2.47 | reduce_tied_grads: 0.30 | comms: 18.15 | reduce_grads: 0.18 | step: 348.89 | _step_clipping: 0.14 | _step_step: 347.13 | _step_zero_grad: 0.49 | _step_check_overflow: 0.57 samples/sec: 16.328 | iteration 5000/ 143000 | elapsed time per iteration (ms): 62715.1 | learning rate: 5.991E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.815513E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 14:32:43,509] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: ../checkpoints/mydata_left-pythia160m/global_step5000/mp_rank_00_model_states.pt [2025-03-28 14:32:57,282] [INFO] [engine.py:1805:_copy_recovery_script] creating recovery script ../checkpoints/mydata_left-pythia160m/zero_to_fp32.py [2025-03-28 14:32:57,287] [INFO] [engine.py:1818:_save_zero_checkpoint] zero checkpoint saved ../checkpoints/mydata_left-pythia160m/global_step5000/zero_pp_rank_0_mp_rank_00_optim_states.pt [2025-03-28 14:43:21,271] [INFO] [logging.py:60:log_dist] [Rank 0] step=5010, skipped=0, lr=[0.0005990726127926808, 0.0005990726127926808], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5010 loss: 2.8247 iter time (s): 62.397 samples/sec: 16.411 %comms: 0.0028887077443452694 %optimizer_step 0.05825567622913948 %forward: 23.308284923243978 %backward: 62.580764200704394 [2025-03-28 14:43:21,271] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18156.88 | forward: 145436.81 | backward_microstep: 390494.43 | backward: 390485.48 | backward_inner_microstep: 390465.56 | backward_inner: 390459.45 | backward_allreduce_microstep: 9.51 | backward_allreduce: 2.54 | reduce_tied_grads: 0.33 | comms: 18.02 | reduce_grads: 0.22 | step: 363.50 | _step_clipping: 0.13 | _step_step: 361.61 | _step_zero_grad: 0.53 | _step_check_overflow: 0.60 samples/sec: 15.984 | iteration 5010/ 143000 | elapsed time per iteration (ms): 64066.1 | learning rate: 5.991E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.828350E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 14:53:50,844] [INFO] [logging.py:60:log_dist] [Rank 0] step=5020, skipped=0, lr=[0.0005990674273122581, 0.0005990674273122581], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5020 loss: 2.8218 iter time (s): 62.957 samples/sec: 16.265 %comms: 0.00285295321275227 %optimizer_step 0.05505671411007874 %forward: 23.14580044717136 %backward: 62.000709485742846 [2025-03-28 14:53:50,844] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23651.64 | forward: 145718.42 | backward_microstep: 390344.67 | backward: 390336.26 | backward_inner_microstep: 390318.63 | backward_inner: 390312.53 | backward_allreduce_microstep: 9.29 | backward_allreduce: 2.63 | reduce_tied_grads: 0.30 | comms: 17.96 | reduce_grads: 0.19 | step: 346.62 | _step_clipping: 0.12 | _step_step: 344.91 | _step_zero_grad: 0.46 | _step_check_overflow: 0.57 samples/sec: 16.265 | iteration 5020/ 143000 | elapsed time per iteration (ms): 62957.3 | learning rate: 5.991E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.819045E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 15:04:16,318] [INFO] [logging.py:60:log_dist] [Rank 0] step=5030, skipped=0, lr=[0.0005990622273975002, 0.0005990622273975002], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5030 loss: 2.8200 iter time (s): 62.547 samples/sec: 16.372 %comms: 0.002867758170884199 %optimizer_step 0.05536206901122641 %forward: 23.256302371057526 %backward: 62.404850359356686 [2025-03-28 15:04:16,319] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19807.46 | forward: 145461.02 | backward_microstep: 390331.64 | backward: 390323.14 | backward_inner_microstep: 390307.20 | backward_inner: 390301.15 | backward_allreduce_microstep: 7.46 | backward_allreduce: 2.59 | reduce_tied_grads: 0.31 | comms: 17.94 | reduce_grads: 0.20 | step: 346.27 | _step_clipping: 0.15 | _step_step: 344.29 | _step_zero_grad: 0.46 | _step_check_overflow: 0.80 samples/sec: 16.372 | iteration 5030/ 143000 | elapsed time per iteration (ms): 62547.4 | learning rate: 5.991E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.807333E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 15:14:42,463] [INFO] [logging.py:60:log_dist] [Rank 0] step=5040, skipped=0, lr=[0.0005990570130486581, 0.0005990570130486581], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5040 loss: 2.8168 iter time (s): 62.614 samples/sec: 16.354 %comms: 0.0028813659409830158 %optimizer_step 0.05596595743457185 %forward: 23.28665590190849 %backward: 62.361017144772376 [2025-03-28 15:14:42,464] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19968.21 | forward: 145806.97 | backward_microstep: 390477.15 | backward: 390467.00 | backward_inner_microstep: 390447.63 | backward_inner: 390441.41 | backward_allreduce_microstep: 9.21 | backward_allreduce: 2.61 | reduce_tied_grads: 0.31 | comms: 18.04 | reduce_grads: 0.20 | step: 350.43 | _step_clipping: 0.13 | _step_step: 348.68 | _step_zero_grad: 0.48 | _step_check_overflow: 0.56 samples/sec: 16.354 | iteration 5040/ 143000 | elapsed time per iteration (ms): 62614.5 | learning rate: 5.991E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.818274E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 15:25:09,385] [INFO] [logging.py:60:log_dist] [Rank 0] step=5050, skipped=0, lr=[0.0005990517842659836, 0.0005990517842659836], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5050 loss: 2.8271 iter time (s): 62.692 samples/sec: 16.334 %comms: 0.002859048380513745 %optimizer_step 0.05602769614602898 %forward: 23.187097922195964 %backward: 62.257535679685326 [2025-03-28 15:25:09,385] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21380.88 | forward: 145363.63 | backward_microstep: 390309.94 | backward: 390302.46 | backward_inner_microstep: 390287.34 | backward_inner: 390281.49 | backward_allreduce_microstep: 7.23 | backward_allreduce: 2.49 | reduce_tied_grads: 0.30 | comms: 17.92 | reduce_grads: 0.19 | step: 351.25 | _step_clipping: 0.17 | _step_step: 349.52 | _step_zero_grad: 0.46 | _step_check_overflow: 0.54 samples/sec: 16.334 | iteration 5050/ 143000 | elapsed time per iteration (ms): 62692.1 | learning rate: 5.991E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.816122E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 15:35:36,722] [INFO] [logging.py:60:log_dist] [Rank 0] step=5060, skipped=0, lr=[0.000599046541049729, 0.000599046541049729], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5060 loss: 2.8306 iter time (s): 62.733 samples/sec: 16.323 %comms: 0.0028916995520387716 %optimizer_step 0.05569597206250723 %forward: 23.166771625361246 %backward: 62.226551449684365 [2025-03-28 15:35:36,722] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21733.09 | forward: 145332.55 | backward_microstep: 390376.22 | backward: 390367.02 | backward_inner_microstep: 390347.93 | backward_inner: 390341.94 | backward_allreduce_microstep: 10.93 | backward_allreduce: 2.53 | reduce_tied_grads: 0.27 | comms: 18.14 | reduce_grads: 0.18 | step: 349.40 | _step_clipping: 0.14 | _step_step: 347.56 | _step_zero_grad: 0.47 | _step_check_overflow: 0.66 samples/sec: 16.323 | iteration 5060/ 143000 | elapsed time per iteration (ms): 62733.7 | learning rate: 5.990E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.812104E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 15:46:04,473] [INFO] [logging.py:60:log_dist] [Rank 0] step=5070, skipped=0, lr=[0.0005990412834001475, 0.0005990412834001475], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5070 loss: 2.7959 iter time (s): 62.775 samples/sec: 16.312 %comms: 0.002876197573968574 %optimizer_step 0.05636084785321487 %forward: 23.16193398766858 %backward: 62.170762926737176 [2025-03-28 15:46:04,474] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22170.25 | forward: 145398.00 | backward_microstep: 390283.22 | backward: 390274.16 | backward_inner_microstep: 390258.71 | backward_inner: 390252.81 | backward_allreduce_microstep: 7.31 | backward_allreduce: 2.52 | reduce_tied_grads: 0.29 | comms: 18.06 | reduce_grads: 0.18 | step: 353.80 | _step_clipping: 0.13 | _step_step: 352.02 | _step_zero_grad: 0.48 | _step_check_overflow: 0.59 samples/sec: 16.312 | iteration 5070/ 143000 | elapsed time per iteration (ms): 62775.1 | learning rate: 5.990E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.805486E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 15:56:33,528] [INFO] [logging.py:60:log_dist] [Rank 0] step=5080, skipped=0, lr=[0.0005990360113174926, 0.0005990360113174926], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5080 loss: 2.8380 iter time (s): 62.905 samples/sec: 16.279 %comms: 0.0028420033416939627 %optimizer_step 0.055431383138681095 %forward: 23.16760781831132 %backward: 62.07882103349739 [2025-03-28 15:56:33,528] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22880.74 | forward: 145735.48 | backward_microstep: 390517.48 | backward: 390505.88 | backward_inner_microstep: 390489.85 | backward_inner: 390483.62 | backward_allreduce_microstep: 7.59 | backward_allreduce: 2.62 | reduce_tied_grads: 0.28 | comms: 17.88 | reduce_grads: 0.18 | step: 348.69 | _step_clipping: 0.11 | _step_step: 347.04 | _step_zero_grad: 0.48 | _step_check_overflow: 0.52 samples/sec: 16.278 | iteration 5080/ 143000 | elapsed time per iteration (ms): 62905.4 | learning rate: 5.990E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.807701E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 16:06:57,790] [INFO] [logging.py:60:log_dist] [Rank 0] step=5090, skipped=0, lr=[0.0005990307248020188, 0.0005990307248020188], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5090 loss: 2.8176 iter time (s): 62.426 samples/sec: 16.403 %comms: 0.002871034175276419 %optimizer_step 0.05524799660779748 %forward: 23.27690829660955 %backward: 62.517223311980494 [2025-03-28 16:06:57,791] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18774.73 | forward: 145307.79 | backward_microstep: 390276.14 | backward: 390268.32 | backward_inner_microstep: 390248.95 | backward_inner: 390240.98 | backward_allreduce_microstep: 11.26 | backward_allreduce: 2.56 | reduce_tied_grads: 0.29 | comms: 17.92 | reduce_grads: 0.18 | step: 344.89 | _step_clipping: 0.12 | _step_step: 343.22 | _step_zero_grad: 0.46 | _step_check_overflow: 0.54 samples/sec: 16.403 | iteration 5090/ 143000 | elapsed time per iteration (ms): 62426.3 | learning rate: 5.990E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.823460E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 16:17:19,881] [INFO] [logging.py:60:log_dist] [Rank 0] step=5100, skipped=0, lr=[0.0005990254238539814, 0.0005990254238539814], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5100 loss: 2.8221 iter time (s): 62.209 samples/sec: 16.461 %comms: 0.002879254363583746 %optimizer_step 0.05540347509828251 %forward: 23.34734815102478 %backward: 62.721857723521666 [2025-03-28 16:17:19,882] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16786.69 | forward: 145240.55 | backward_microstep: 390190.85 | backward: 390183.81 | backward_inner_microstep: 390168.55 | backward_inner: 390162.86 | backward_allreduce_microstep: 7.45 | backward_allreduce: 2.68 | reduce_tied_grads: 0.28 | comms: 17.91 | reduce_grads: 0.18 | step: 344.66 | _step_clipping: 0.13 | _step_step: 342.99 | _step_zero_grad: 0.46 | _step_check_overflow: 0.54 samples/sec: 16.461 | iteration 5100/ 143000 | elapsed time per iteration (ms): 62209.1 | learning rate: 5.990E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.813500E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 16:27:41,455] [INFO] [logging.py:60:log_dist] [Rank 0] step=5110, skipped=0, lr=[0.0005990201084736361, 0.0005990201084736361], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5110 loss: 2.8122 iter time (s): 62.157 samples/sec: 16.474 %comms: 0.002868764223037856 %optimizer_step 0.05553603797734135 %forward: 23.36559557571702 %backward: 62.76436491798197 [2025-03-28 16:27:41,455] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16344.65 | forward: 145233.11 | backward_microstep: 390129.16 | backward: 390123.32 | backward_inner_microstep: 390108.59 | backward_inner: 390103.12 | backward_allreduce_microstep: 7.17 | backward_allreduce: 2.46 | reduce_tied_grads: 0.25 | comms: 17.83 | reduce_grads: 0.18 | step: 345.19 | _step_clipping: 0.11 | _step_step: 343.61 | _step_zero_grad: 0.48 | _step_check_overflow: 0.47 samples/sec: 16.474 | iteration 5110/ 143000 | elapsed time per iteration (ms): 62157.3 | learning rate: 5.990E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.802518E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 16:38:13,073] [INFO] [logging.py:60:log_dist] [Rank 0] step=5120, skipped=0, lr=[0.0005990147786612397, 0.0005990147786612397], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5120 loss: 2.8334 iter time (s): 63.161 samples/sec: 16.212 %comms: 0.002842693012564923 %optimizer_step 0.05524256803885959 %forward: 23.02036918489157 %backward: 61.7944478352194 [2025-03-28 16:38:13,074] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26034.83 | forward: 145399.71 | backward_microstep: 390309.76 | backward: 390301.93 | backward_inner_microstep: 390283.06 | backward_inner: 390275.49 | backward_allreduce_microstep: 8.98 | backward_allreduce: 4.23 | reduce_tied_grads: 0.30 | comms: 17.95 | reduce_grads: 0.19 | step: 348.92 | _step_clipping: 0.14 | _step_step: 347.24 | _step_zero_grad: 0.46 | _step_check_overflow: 0.52 samples/sec: 16.212 | iteration 5120/ 143000 | elapsed time per iteration (ms): 63161.9 | learning rate: 5.990E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.810328E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 16:48:37,301] [INFO] [logging.py:60:log_dist] [Rank 0] step=5130, skipped=0, lr=[0.0005990094344170492, 0.0005990094344170492], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5130 loss: 2.7967 iter time (s): 62.422 samples/sec: 16.404 %comms: 0.002889719749117402 %optimizer_step 0.055660429874851966 %forward: 23.282111882664992 %backward: 62.530733995038155 [2025-03-28 16:48:37,302] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18658.64 | forward: 145332.12 | backward_microstep: 390339.56 | backward: 390330.74 | backward_inner_microstep: 390315.39 | backward_inner: 390309.51 | backward_allreduce_microstep: 7.28 | backward_allreduce: 2.50 | reduce_tied_grads: 0.28 | comms: 18.04 | reduce_grads: 0.18 | step: 347.44 | _step_clipping: 0.15 | _step_step: 345.71 | _step_zero_grad: 0.47 | _step_check_overflow: 0.56 samples/sec: 16.404 | iteration 5130/ 143000 | elapsed time per iteration (ms): 62422.8 | learning rate: 5.990E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.800871E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 16:59:01,122] [INFO] [logging.py:60:log_dist] [Rank 0] step=5140, skipped=0, lr=[0.0005990040757413225, 0.0005990040757413225], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5140 loss: 2.8175 iter time (s): 62.381 samples/sec: 16.415 %comms: 0.0028969047448034226 %optimizer_step 0.05609903660129761 %forward: 23.305980907738995 %backward: 62.5852579980247 [2025-03-28 16:59:01,123] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18077.41 | forward: 145384.98 | backward_microstep: 390420.73 | backward: 390412.95 | backward_inner_microstep: 390397.52 | backward_inner: 390391.54 | backward_allreduce_microstep: 7.32 | backward_allreduce: 2.56 | reduce_tied_grads: 0.33 | comms: 18.07 | reduce_grads: 0.20 | step: 349.95 | _step_clipping: 0.13 | _step_step: 348.01 | _step_zero_grad: 0.57 | _step_check_overflow: 0.62 samples/sec: 16.415 | iteration 5140/ 143000 | elapsed time per iteration (ms): 62382.1 | learning rate: 5.990E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.817957E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 17:09:26,803] [INFO] [logging.py:60:log_dist] [Rank 0] step=5150, skipped=0, lr=[0.0005989987026343183, 0.0005989987026343183], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5150 loss: 2.8037 iter time (s): 62.568 samples/sec: 16.366 %comms: 0.002908349069231928 %optimizer_step 0.05630642753253215 %forward: 23.230752811038094 %backward: 62.39042708286235 [2025-03-28 17:09:26,804] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20017.18 | forward: 145349.08 | backward_microstep: 390370.10 | backward: 390361.48 | backward_inner_microstep: 390344.13 | backward_inner: 390338.00 | backward_allreduce_microstep: 9.14 | backward_allreduce: 4.30 | reduce_tied_grads: 0.30 | comms: 18.20 | reduce_grads: 0.18 | step: 352.30 | _step_clipping: 0.14 | _step_step: 350.51 | _step_zero_grad: 0.49 | _step_check_overflow: 0.59 samples/sec: 16.366 | iteration 5150/ 143000 | elapsed time per iteration (ms): 62568.1 | learning rate: 5.990E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.809896E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 17:19:46,184] [INFO] [logging.py:60:log_dist] [Rank 0] step=5160, skipped=0, lr=[0.0005989933150962961, 0.0005989933150962961], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5160 loss: 2.7716 iter time (s): 61.937 samples/sec: 16.533 %comms: 0.002925116242393042 %optimizer_step 0.0569580836543343 %forward: 23.462148938275746 %backward: 63.02550923432002 [2025-03-28 17:19:46,184] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13751.94 | forward: 145318.60 | backward_microstep: 390375.12 | backward: 390364.01 | backward_inner_microstep: 390348.19 | backward_inner: 390341.95 | backward_allreduce_microstep: 7.45 | backward_allreduce: 2.56 | reduce_tied_grads: 0.32 | comms: 18.12 | reduce_grads: 0.18 | step: 352.78 | _step_clipping: 0.13 | _step_step: 351.03 | _step_zero_grad: 0.47 | _step_check_overflow: 0.58 samples/sec: 16.533 | iteration 5160/ 143000 | elapsed time per iteration (ms): 61938.0 | learning rate: 5.990E-04 | approx flops per GPU: 71.3TFLOPS | lm_loss: 2.792583E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 17:30:13,852] [INFO] [logging.py:60:log_dist] [Rank 0] step=5170, skipped=0, lr=[0.0005989879131275157, 0.0005989879131275157], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5170 loss: 2.8102 iter time (s): 62.766 samples/sec: 16.314 %comms: 0.002885200395075539 %optimizer_step 0.055534561772647366 %forward: 23.16359597786154 %backward: 62.165994053216664 [2025-03-28 17:30:13,852] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22190.91 | forward: 145389.21 | backward_microstep: 390199.90 | backward: 390192.65 | backward_inner_microstep: 390177.29 | backward_inner: 390171.42 | backward_allreduce_microstep: 7.41 | backward_allreduce: 2.56 | reduce_tied_grads: 0.28 | comms: 18.11 | reduce_grads: 0.37 | step: 348.57 | _step_clipping: 0.13 | _step_step: 346.90 | _step_zero_grad: 0.46 | _step_check_overflow: 0.54 samples/sec: 16.314 | iteration 5170/ 143000 | elapsed time per iteration (ms): 62766.8 | learning rate: 5.990E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.790158E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 17:40:32,695] [INFO] [logging.py:60:log_dist] [Rank 0] step=5180, skipped=0, lr=[0.0005989824967282379, 0.0005989824967282379], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5180 loss: 2.7975 iter time (s): 61.884 samples/sec: 16.547 %comms: 0.0028808821278952066 %optimizer_step 0.05569080028414754 %forward: 23.46097593828 %backward: 63.04437820654387 [2025-03-28 17:40:32,696] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13671.95 | forward: 145185.39 | backward_microstep: 390152.23 | backward: 390142.45 | backward_inner_microstep: 390127.22 | backward_inner: 390121.69 | backward_allreduce_microstep: 7.24 | backward_allreduce: 2.49 | reduce_tied_grads: 0.25 | comms: 17.83 | reduce_grads: 0.17 | step: 344.64 | _step_clipping: 0.11 | _step_step: 343.03 | _step_zero_grad: 0.45 | _step_check_overflow: 0.51 samples/sec: 16.547 | iteration 5180/ 143000 | elapsed time per iteration (ms): 61884.4 | learning rate: 5.990E-04 | approx flops per GPU: 71.4TFLOPS | lm_loss: 2.814528E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 17:50:56,406] [INFO] [logging.py:60:log_dist] [Rank 0] step=5190, skipped=0, lr=[0.0005989770658987241, 0.0005989770658987241], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5190 loss: 2.7997 iter time (s): 62.370 samples/sec: 16.418 %comms: 0.00286952484409396 %optimizer_step 0.05566018872958974 %forward: 23.298295143069396 %backward: 62.55748219193723 [2025-03-28 17:50:56,406] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18333.00 | forward: 145312.61 | backward_microstep: 390182.63 | backward: 390174.09 | backward_inner_microstep: 390159.14 | backward_inner: 390153.54 | backward_allreduce_microstep: 7.23 | backward_allreduce: 2.49 | reduce_tied_grads: 0.29 | comms: 17.90 | reduce_grads: 0.18 | step: 347.16 | _step_clipping: 0.11 | _step_step: 345.57 | _step_zero_grad: 0.46 | _step_check_overflow: 0.45 samples/sec: 16.418 | iteration 5190/ 143000 | elapsed time per iteration (ms): 62371.0 | learning rate: 5.990E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.814757E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 18:01:19,020] [INFO] [logging.py:60:log_dist] [Rank 0] step=5200, skipped=0, lr=[0.0005989716206392365, 0.0005989716206392365], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5200 loss: 2.7989 iter time (s): 62.261 samples/sec: 16.447 %comms: 0.0029018437654492885 %optimizer_step 0.058850497479831274 %forward: 23.332186778466177 %backward: 62.67562870983655 [2025-03-28 18:01:19,020] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17191.92 | forward: 145268.14 | backward_microstep: 390231.37 | backward: 390223.70 | backward_inner_microstep: 390208.28 | backward_inner: 390202.40 | backward_allreduce_microstep: 7.40 | backward_allreduce: 2.55 | reduce_tied_grads: 0.28 | comms: 18.07 | reduce_grads: 0.18 | step: 366.41 | _step_clipping: 0.14 | _step_step: 364.63 | _step_zero_grad: 0.50 | _step_check_overflow: 0.57 samples/sec: 16.447 | iteration 5200/ 143000 | elapsed time per iteration (ms): 62261.4 | learning rate: 5.990E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.791128E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 18:11:40,962] [INFO] [logging.py:60:log_dist] [Rank 0] step=5210, skipped=0, lr=[0.000598966160950038, 0.000598966160950038], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5210 loss: 2.7796 iter time (s): 62.194 samples/sec: 16.465 %comms: 0.003036232401694653 %optimizer_step 0.05739247085559799 %forward: 23.347593777407166 %backward: 62.72893503012459 [2025-03-28 18:11:40,963] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16687.65 | forward: 145207.43 | backward_microstep: 390143.71 | backward: 390134.75 | backward_inner_microstep: 390119.71 | backward_inner: 390113.83 | backward_allreduce_microstep: 7.16 | backward_allreduce: 2.46 | reduce_tied_grads: 0.29 | comms: 18.88 | reduce_grads: 0.19 | step: 356.95 | _step_clipping: 0.16 | _step_step: 354.73 | _step_zero_grad: 0.68 | _step_check_overflow: 0.81 samples/sec: 16.465 | iteration 5210/ 143000 | elapsed time per iteration (ms): 62194.3 | learning rate: 5.990E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.792545E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 18:22:04,472] [INFO] [logging.py:60:log_dist] [Rank 0] step=5220, skipped=0, lr=[0.0005989606868313917, 0.0005989606868313917], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5220 loss: 2.7783 iter time (s): 62.350 samples/sec: 16.423 %comms: 0.0028701080150685832 %optimizer_step 0.056330831245732335 %forward: 23.296841852566814 %backward: 62.55413129331255 [2025-03-28 18:22:04,472] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18383.19 | forward: 145256.61 | backward_microstep: 390033.80 | backward: 390027.15 | backward_inner_microstep: 390012.16 | backward_inner: 390006.61 | backward_allreduce_microstep: 7.26 | backward_allreduce: 2.49 | reduce_tied_grads: 0.29 | comms: 17.90 | reduce_grads: 0.18 | step: 351.22 | _step_clipping: 0.16 | _step_step: 349.54 | _step_zero_grad: 0.46 | _step_check_overflow: 0.54 samples/sec: 16.423 | iteration 5220/ 143000 | elapsed time per iteration (ms): 62350.9 | learning rate: 5.990E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.788216E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 18:32:24,039] [INFO] [logging.py:60:log_dist] [Rank 0] step=5230, skipped=0, lr=[0.0005989551982835623, 0.0005989551982835623], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5230 loss: 2.7908 iter time (s): 61.956 samples/sec: 16.528 %comms: 0.0029283914964490496 %optimizer_step 0.056607718634819136 %forward: 23.44718736272287 %backward: 62.964872332380395 [2025-03-28 18:32:24,039] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14324.08 | forward: 145269.67 | backward_microstep: 390112.40 | backward: 390105.91 | backward_inner_microstep: 390090.92 | backward_inner: 390085.14 | backward_allreduce_microstep: 7.29 | backward_allreduce: 2.50 | reduce_tied_grads: 0.30 | comms: 18.14 | reduce_grads: 0.20 | step: 350.72 | _step_clipping: 0.11 | _step_step: 349.01 | _step_zero_grad: 0.49 | _step_check_overflow: 0.55 samples/sec: 16.528 | iteration 5230/ 143000 | elapsed time per iteration (ms): 61956.7 | learning rate: 5.990E-04 | approx flops per GPU: 71.3TFLOPS | lm_loss: 2.803146E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 18:42:46,901] [INFO] [logging.py:60:log_dist] [Rank 0] step=5240, skipped=0, lr=[0.0005989496953068143, 0.0005989496953068143], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5240 loss: 2.7927 iter time (s): 62.286 samples/sec: 16.440 %comms: 0.0028739683330351734 %optimizer_step 0.05583845518508977 %forward: 23.314170611057516 %backward: 62.62895124247784 [2025-03-28 18:42:46,901] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17727.60 | forward: 145213.88 | backward_microstep: 390095.14 | backward: 390088.64 | backward_inner_microstep: 390073.51 | backward_inner: 390066.09 | backward_allreduce_microstep: 7.15 | backward_allreduce: 2.44 | reduce_tied_grads: 0.28 | comms: 17.90 | reduce_grads: 0.18 | step: 347.79 | _step_clipping: 0.12 | _step_step: 346.16 | _step_zero_grad: 0.46 | _step_check_overflow: 0.52 samples/sec: 16.440 | iteration 5240/ 143000 | elapsed time per iteration (ms): 62286.2 | learning rate: 5.989E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.802151E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 18:53:10,658] [INFO] [logging.py:60:log_dist] [Rank 0] step=5250, skipped=0, lr=[0.0005989441779014136, 0.0005989441779014136], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5250 loss: 2.8002 iter time (s): 62.375 samples/sec: 16.417 %comms: 0.002933716291353941 %optimizer_step 0.05544705443465131 %forward: 23.29627041810322 %backward: 62.56339578981722 [2025-03-28 18:53:10,659] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18362.12 | forward: 145310.86 | backward_microstep: 390247.75 | backward: 390240.19 | backward_inner_microstep: 390224.79 | backward_inner: 390218.83 | backward_allreduce_microstep: 7.38 | backward_allreduce: 2.56 | reduce_tied_grads: 0.46 | comms: 18.30 | reduce_grads: 0.24 | step: 345.85 | _step_clipping: 0.11 | _step_step: 344.17 | _step_zero_grad: 0.47 | _step_check_overflow: 0.54 samples/sec: 16.417 | iteration 5250/ 143000 | elapsed time per iteration (ms): 62375.8 | learning rate: 5.989E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.790781E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 19:03:38,266] [INFO] [logging.py:60:log_dist] [Rank 0] step=5260, skipped=0, lr=[0.0005989386460676263, 0.0005989386460676263], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5260 loss: 2.7727 iter time (s): 62.760 samples/sec: 16.316 %comms: 0.0028763269891732106 %optimizer_step 0.05721199197298605 %forward: 23.163306651886224 %backward: 62.18723978410311 [2025-03-28 19:03:38,267] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22018.71 | forward: 145373.19 | backward_microstep: 390295.73 | backward: 390287.86 | backward_inner_microstep: 390272.43 | backward_inner: 390266.18 | backward_allreduce_microstep: 7.32 | backward_allreduce: 2.52 | reduce_tied_grads: 0.31 | comms: 18.05 | reduce_grads: 0.18 | step: 359.06 | _step_clipping: 0.12 | _step_step: 357.41 | _step_zero_grad: 0.47 | _step_check_overflow: 0.49 samples/sec: 16.316 | iteration 5260/ 143000 | elapsed time per iteration (ms): 62760.7 | learning rate: 5.989E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.791457E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 19:13:57,626] [INFO] [logging.py:60:log_dist] [Rank 0] step=5270, skipped=0, lr=[0.0005989330998057194, 0.0005989330998057194], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5270 loss: 2.7787 iter time (s): 61.935 samples/sec: 16.533 %comms: 0.00289907587072603 %optimizer_step 0.057379152033299775 %forward: 23.473453819626098 %backward: 63.005652468472974 [2025-03-28 19:13:57,626] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13881.07 | forward: 145383.77 | backward_microstep: 390235.06 | backward: 390228.01 | backward_inner_microstep: 390212.51 | backward_inner: 390206.71 | backward_allreduce_microstep: 7.39 | backward_allreduce: 2.54 | reduce_tied_grads: 0.30 | comms: 17.96 | reduce_grads: 0.18 | step: 355.38 | _step_clipping: 0.13 | _step_step: 353.68 | _step_zero_grad: 0.48 | _step_check_overflow: 0.54 samples/sec: 16.533 | iteration 5270/ 143000 | elapsed time per iteration (ms): 61935.9 | learning rate: 5.989E-04 | approx flops per GPU: 71.3TFLOPS | lm_loss: 2.784308E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 19:24:23,815] [INFO] [logging.py:60:log_dist] [Rank 0] step=5280, skipped=0, lr=[0.0005989275391159608, 0.0005989275391159608], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5280 loss: 2.7720 iter time (s): 62.618 samples/sec: 16.353 %comms: 0.0029040438727925494 %optimizer_step 0.05522438905291786 %forward: 23.19898573724344 %backward: 62.308334237122054 [2025-03-28 19:24:23,816] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20902.67 | forward: 145268.37 | backward_microstep: 390171.30 | backward: 390164.90 | backward_inner_microstep: 390150.00 | backward_inner: 390144.34 | backward_allreduce_microstep: 7.21 | backward_allreduce: 2.47 | reduce_tied_grads: 0.26 | comms: 18.18 | reduce_grads: 0.18 | step: 345.81 | _step_clipping: 0.12 | _step_step: 344.17 | _step_zero_grad: 0.46 | _step_check_overflow: 0.52 samples/sec: 16.353 | iteration 5280/ 143000 | elapsed time per iteration (ms): 62618.9 | learning rate: 5.989E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.793394E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 19:34:43,581] [INFO] [logging.py:60:log_dist] [Rank 0] step=5290, skipped=0, lr=[0.0005989219639986186, 0.0005989219639986186], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5290 loss: 2.7739 iter time (s): 61.976 samples/sec: 16.523 %comms: 0.0028916339168041116 %optimizer_step 0.05716269421678706 %forward: 23.45977661071429 %backward: 63.00036021379185 [2025-03-28 19:34:43,582] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13984.44 | forward: 145394.47 | backward_microstep: 390461.60 | backward: 390451.45 | backward_inner_microstep: 390435.96 | backward_inner: 390429.78 | backward_allreduce_microstep: 7.19 | backward_allreduce: 2.47 | reduce_tied_grads: 0.29 | comms: 17.92 | reduce_grads: 0.18 | step: 354.27 | _step_clipping: 0.13 | _step_step: 352.58 | _step_zero_grad: 0.47 | _step_check_overflow: 0.54 samples/sec: 16.522 | iteration 5290/ 143000 | elapsed time per iteration (ms): 61976.6 | learning rate: 5.989E-04 | approx flops per GPU: 71.3TFLOPS | lm_loss: 2.777486E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 19:45:10,214] [INFO] [logging.py:60:log_dist] [Rank 0] step=5300, skipped=0, lr=[0.0005989163744539621, 0.0005989163744539621], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5300 loss: 2.7909 iter time (s): 62.663 samples/sec: 16.341 %comms: 0.002857705290609273 %optimizer_step 0.05614690052842063 %forward: 23.192280934854008 %backward: 62.27439595646314 [2025-03-28 19:45:10,214] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21226.95 | forward: 145329.01 | backward_microstep: 390236.97 | backward: 390227.95 | backward_inner_microstep: 390210.76 | backward_inner: 390204.93 | backward_allreduce_microstep: 7.44 | backward_allreduce: 2.53 | reduce_tied_grads: 0.29 | comms: 17.91 | reduce_grads: 0.19 | step: 351.83 | _step_clipping: 0.13 | _step_step: 350.07 | _step_zero_grad: 0.48 | _step_check_overflow: 0.58 samples/sec: 16.341 | iteration 5300/ 143000 | elapsed time per iteration (ms): 62663.2 | learning rate: 5.989E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.787990E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 19:55:35,933] [INFO] [logging.py:60:log_dist] [Rank 0] step=5310, skipped=0, lr=[0.0005989107704822609, 0.0005989107704822609], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5310 loss: 2.7815 iter time (s): 62.571 samples/sec: 16.365 %comms: 0.002962657346218976 %optimizer_step 0.056435015954174446 %forward: 23.216408138224374 %backward: 62.36935209502731 [2025-03-28 19:55:35,934] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20304.70 | forward: 145268.30 | backward_microstep: 390263.48 | backward: 390253.73 | backward_inner_microstep: 390238.50 | backward_inner: 390232.65 | backward_allreduce_microstep: 7.29 | backward_allreduce: 2.52 | reduce_tied_grads: 0.31 | comms: 18.54 | reduce_grads: 0.38 | step: 353.12 | _step_clipping: 0.13 | _step_step: 351.37 | _step_zero_grad: 0.49 | _step_check_overflow: 0.55 samples/sec: 16.365 | iteration 5310/ 143000 | elapsed time per iteration (ms): 62571.9 | learning rate: 5.989E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.784342E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 20:05:59,045] [INFO] [logging.py:60:log_dist] [Rank 0] step=5320, skipped=0, lr=[0.0005989051520837856, 0.0005989051520837856], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5320 loss: 2.7655 iter time (s): 62.311 samples/sec: 16.434 %comms: 0.0028764523824149897 %optimizer_step 0.05690696928774187 %forward: 23.315042849517376 %backward: 62.636968268716444 [2025-03-28 20:05:59,046] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17650.03 | forward: 145277.49 | backward_microstep: 390302.20 | backward: 390294.88 | backward_inner_microstep: 390279.77 | backward_inner: 390274.03 | backward_allreduce_microstep: 7.13 | backward_allreduce: 2.45 | reduce_tied_grads: 0.27 | comms: 17.92 | reduce_grads: 0.18 | step: 354.59 | _step_clipping: 0.12 | _step_step: 352.90 | _step_zero_grad: 0.48 | _step_check_overflow: 0.55 samples/sec: 16.434 | iteration 5320/ 143000 | elapsed time per iteration (ms): 62311.2 | learning rate: 5.989E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.779160E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 20:16:21,416] [INFO] [logging.py:60:log_dist] [Rank 0] step=5330, skipped=0, lr=[0.0005988995192588074, 0.0005988995192588074], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5330 loss: 2.7821 iter time (s): 62.237 samples/sec: 16.453 %comms: 0.0028666965498227105 %optimizer_step 0.05638257940731726 %forward: 23.33130861652362 %backward: 62.69834291474007 [2025-03-28 20:16:21,417] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17061.84 | forward: 145206.09 | backward_microstep: 390219.16 | backward: 390213.05 | backward_inner_microstep: 390198.52 | backward_inner: 390193.13 | backward_allreduce_microstep: 7.03 | backward_allreduce: 2.41 | reduce_tied_grads: 0.25 | comms: 17.84 | reduce_grads: 0.18 | step: 350.91 | _step_clipping: 0.12 | _step_step: 349.06 | _step_zero_grad: 0.45 | _step_check_overflow: 0.75 samples/sec: 16.453 | iteration 5330/ 143000 | elapsed time per iteration (ms): 62237.1 | learning rate: 5.989E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.780421E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 20:26:50,603] [INFO] [logging.py:60:log_dist] [Rank 0] step=5340, skipped=0, lr=[0.000598893872007598, 0.000598893872007598], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5340 loss: 2.7614 iter time (s): 62.918 samples/sec: 16.275 %comms: 0.002847959639572532 %optimizer_step 0.05531783707224838 %forward: 23.096160930739146 %backward: 62.030072672042856 [2025-03-28 20:26:50,604] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23670.69 | forward: 145316.69 | backward_microstep: 390288.25 | backward: 390281.52 | backward_inner_microstep: 390266.29 | backward_inner: 390260.52 | backward_allreduce_microstep: 7.43 | backward_allreduce: 2.50 | reduce_tied_grads: 0.29 | comms: 17.92 | reduce_grads: 0.18 | step: 348.05 | _step_clipping: 0.11 | _step_step: 346.45 | _step_zero_grad: 0.46 | _step_check_overflow: 0.49 samples/sec: 16.275 | iteration 5340/ 143000 | elapsed time per iteration (ms): 62918.7 | learning rate: 5.989E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.769867E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 20:37:13,882] [INFO] [logging.py:60:log_dist] [Rank 0] step=5350, skipped=0, lr=[0.00059888821033043, 0.00059888821033043], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5350 loss: 2.7956 iter time (s): 62.327 samples/sec: 16.429 %comms: 0.002889069479894305 %optimizer_step 0.05636523832262768 %forward: 23.319274295599016 %backward: 62.626147693284295 [2025-03-28 20:37:13,883] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17694.82 | forward: 145342.83 | backward_microstep: 390339.62 | backward: 390332.12 | backward_inner_microstep: 390317.15 | backward_inner: 390311.40 | backward_allreduce_microstep: 7.16 | backward_allreduce: 2.44 | reduce_tied_grads: 0.28 | comms: 18.01 | reduce_grads: 0.18 | step: 351.31 | _step_clipping: 0.12 | _step_step: 349.56 | _step_zero_grad: 0.47 | _step_check_overflow: 0.62 samples/sec: 16.429 | iteration 5350/ 143000 | elapsed time per iteration (ms): 62327.9 | learning rate: 5.989E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.782965E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 20:47:41,705] [INFO] [logging.py:60:log_dist] [Rank 0] step=5360, skipped=0, lr=[0.0005988825342275769, 0.0005988825342275769], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5360 loss: 2.8063 iter time (s): 62.782 samples/sec: 16.310 %comms: 0.0028755277033380225 %optimizer_step 0.05577035093049841 %forward: 23.157130736465913 %backward: 62.175301185010326 [2025-03-28 20:47:41,705] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22184.30 | forward: 145384.43 | backward_microstep: 390360.10 | backward: 390347.18 | backward_inner_microstep: 390330.04 | backward_inner: 390324.03 | backward_allreduce_microstep: 7.31 | backward_allreduce: 2.52 | reduce_tied_grads: 0.29 | comms: 18.05 | reduce_grads: 0.18 | step: 350.14 | _step_clipping: 0.13 | _step_step: 348.44 | _step_zero_grad: 0.47 | _step_check_overflow: 0.56 samples/sec: 16.310 | iteration 5360/ 143000 | elapsed time per iteration (ms): 62782.3 | learning rate: 5.989E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.783388E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 20:58:10,233] [INFO] [logging.py:60:log_dist] [Rank 0] step=5370, skipped=0, lr=[0.0005988768436993124, 0.0005988768436993124], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5370 loss: 2.7972 iter time (s): 62.852 samples/sec: 16.292 %comms: 0.0028684304615152123 %optimizer_step 0.0556406395297911 %forward: 23.139450398876125 %backward: 62.11691068593282 [2025-03-28 20:58:10,233] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22754.17 | forward: 145436.69 | backward_microstep: 390431.38 | backward: 390418.87 | backward_inner_microstep: 390399.68 | backward_inner: 390393.49 | backward_allreduce_microstep: 9.20 | backward_allreduce: 4.33 | reduce_tied_grads: 0.31 | comms: 18.03 | reduce_grads: 0.19 | step: 349.71 | _step_clipping: 0.14 | _step_step: 348.02 | _step_zero_grad: 0.47 | _step_check_overflow: 0.53 samples/sec: 16.292 | iteration 5370/ 143000 | elapsed time per iteration (ms): 62852.8 | learning rate: 5.989E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.792611E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 21:08:36,286] [INFO] [logging.py:60:log_dist] [Rank 0] step=5380, skipped=0, lr=[0.0005988711387459111, 0.0005988711387459111], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5380 loss: 2.7809 iter time (s): 62.605 samples/sec: 16.357 %comms: 0.0028557768472573285 %optimizer_step 0.055764538336616494 %forward: 23.211415884087884 %backward: 62.338603246376124 [2025-03-28 21:08:36,287] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20561.86 | forward: 145314.59 | backward_microstep: 390276.19 | backward: 390269.55 | backward_inner_microstep: 390252.47 | backward_inner: 390246.68 | backward_allreduce_microstep: 7.32 | backward_allreduce: 2.53 | reduce_tied_grads: 0.26 | comms: 17.88 | reduce_grads: 0.18 | step: 349.11 | _step_clipping: 0.13 | _step_step: 347.53 | _step_zero_grad: 0.46 | _step_check_overflow: 0.46 samples/sec: 16.356 | iteration 5380/ 143000 | elapsed time per iteration (ms): 62605.3 | learning rate: 5.989E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.778331E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 21:18:59,746] [INFO] [logging.py:60:log_dist] [Rank 0] step=5390, skipped=0, lr=[0.0005988654193676485, 0.0005988654193676485], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5390 loss: 2.7949 iter time (s): 62.345 samples/sec: 16.425 %comms: 0.0028937774070479077 %optimizer_step 0.0596223845535408 %forward: 23.322310846176272 %backward: 62.62167563715929 [2025-03-28 21:18:59,747] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17698.84 | forward: 145403.89 | backward_microstep: 390426.09 | backward: 390417.37 | backward_inner_microstep: 390402.11 | backward_inner: 390396.17 | backward_allreduce_microstep: 7.19 | backward_allreduce: 2.47 | reduce_tied_grads: 0.29 | comms: 18.04 | reduce_grads: 0.19 | step: 371.72 | _step_clipping: 0.13 | _step_step: 369.89 | _step_zero_grad: 0.52 | _step_check_overflow: 0.59 samples/sec: 16.424 | iteration 5390/ 143000 | elapsed time per iteration (ms): 62346.0 | learning rate: 5.989E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.785610E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 21:29:28,707] [INFO] [logging.py:60:log_dist] [Rank 0] step=5400, skipped=0, lr=[0.0005988596855648008, 0.0005988596855648008], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5400 loss: 2.7642 iter time (s): 62.896 samples/sec: 16.281 %comms: 0.002872482475343663 %optimizer_step 0.05590724957761691 %forward: 23.10736459821848 %backward: 62.05752333322931 [2025-03-28 21:29:28,708] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23413.06 | forward: 145335.11 | backward_microstep: 390322.19 | backward: 390314.38 | backward_inner_microstep: 390298.69 | backward_inner: 390290.29 | backward_allreduce_microstep: 7.63 | backward_allreduce: 2.74 | reduce_tied_grads: 0.28 | comms: 18.07 | reduce_grads: 0.19 | step: 351.63 | _step_clipping: 0.13 | _step_step: 349.91 | _step_zero_grad: 0.48 | _step_check_overflow: 0.55 samples/sec: 16.281 | iteration 5400/ 143000 | elapsed time per iteration (ms): 62896.1 | learning rate: 5.989E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.778242E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 21:39:50,916] [INFO] [logging.py:60:log_dist] [Rank 0] step=5410, skipped=0, lr=[0.0005988539373376443, 0.0005988539373376443], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5410 loss: 2.7634 iter time (s): 62.220 samples/sec: 16.458 %comms: 0.002862733802869294 %optimizer_step 0.055295874717937156 %forward: 23.337966604774206 %backward: 62.70403689823654 [2025-03-28 21:39:50,916] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16991.81 | forward: 145209.51 | backward_microstep: 390152.28 | backward: 390146.36 | backward_inner_microstep: 390131.71 | backward_inner: 390126.11 | backward_allreduce_microstep: 7.12 | backward_allreduce: 2.45 | reduce_tied_grads: 0.24 | comms: 17.81 | reduce_grads: 0.18 | step: 344.05 | _step_clipping: 0.11 | _step_step: 342.41 | _step_zero_grad: 0.46 | _step_check_overflow: 0.55 samples/sec: 16.458 | iteration 5410/ 143000 | elapsed time per iteration (ms): 62220.8 | learning rate: 5.989E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.759751E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 21:50:18,292] [INFO] [logging.py:60:log_dist] [Rank 0] step=5420, skipped=0, lr=[0.0005988481746864567, 0.0005988481746864567], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5420 loss: 2.7625 iter time (s): 62.737 samples/sec: 16.322 %comms: 0.002891140708540921 %optimizer_step 0.05634169217633029 %forward: 23.166927911684514 %backward: 62.21394166482798 [2025-03-28 21:50:18,292] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21786.27 | forward: 145342.52 | backward_microstep: 390320.20 | backward: 390312.05 | backward_inner_microstep: 390296.85 | backward_inner: 390290.99 | backward_allreduce_microstep: 7.27 | backward_allreduce: 2.51 | reduce_tied_grads: 0.30 | comms: 18.14 | reduce_grads: 0.18 | step: 353.47 | _step_clipping: 0.13 | _step_step: 351.71 | _step_zero_grad: 0.48 | _step_check_overflow: 0.58 samples/sec: 16.322 | iteration 5420/ 143000 | elapsed time per iteration (ms): 62737.6 | learning rate: 5.988E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.780057E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 22:00:46,188] [INFO] [logging.py:60:log_dist] [Rank 0] step=5430, skipped=0, lr=[0.0005988423976115163, 0.0005988423976115163], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5430 loss: 2.7703 iter time (s): 62.789 samples/sec: 16.309 %comms: 0.002913275389809873 %optimizer_step 0.05634696769289684 %forward: 23.16746630930367 %backward: 62.17444311270294 [2025-03-28 22:00:46,189] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22098.75 | forward: 145466.39 | backward_microstep: 390396.60 | backward: 390387.62 | backward_inner_microstep: 390372.01 | backward_inner: 390365.88 | backward_allreduce_microstep: 7.39 | backward_allreduce: 2.53 | reduce_tied_grads: 0.30 | comms: 18.29 | reduce_grads: 0.18 | step: 353.80 | _step_clipping: 0.12 | _step_step: 351.98 | _step_zero_grad: 0.51 | _step_check_overflow: 0.62 samples/sec: 16.308 | iteration 5430/ 143000 | elapsed time per iteration (ms): 62789.6 | learning rate: 5.988E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.766599E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 22:11:15,047] [INFO] [logging.py:60:log_dist] [Rank 0] step=5440, skipped=0, lr=[0.0005988366061131016, 0.0005988366061131016], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5440 loss: 2.7629 iter time (s): 62.885 samples/sec: 16.284 %comms: 0.00284591655562089 %optimizer_step 0.05550750505907979 %forward: 23.11147065059437 %backward: 62.06562407644981 [2025-03-28 22:11:15,048] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23296.30 | forward: 145337.34 | backward_microstep: 390309.62 | backward: 390301.99 | backward_inner_microstep: 390286.83 | backward_inner: 390280.98 | backward_allreduce_microstep: 7.22 | backward_allreduce: 2.48 | reduce_tied_grads: 0.28 | comms: 17.90 | reduce_grads: 0.18 | step: 349.06 | _step_clipping: 0.13 | _step_step: 347.43 | _step_zero_grad: 0.46 | _step_check_overflow: 0.50 samples/sec: 16.283 | iteration 5440/ 143000 | elapsed time per iteration (ms): 62885.9 | learning rate: 5.988E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.769325E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 22:21:38,661] [INFO] [logging.py:60:log_dist] [Rank 0] step=5450, skipped=0, lr=[0.0005988308001914923, 0.0005988308001914923], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5450 loss: 2.7678 iter time (s): 62.361 samples/sec: 16.421 %comms: 0.002876200106489751 %optimizer_step 0.056221511298638664 %forward: 23.299045833135683 %backward: 62.59054893376293 [2025-03-28 22:21:38,662] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18054.14 | forward: 145294.84 | backward_microstep: 390328.65 | backward: 390320.01 | backward_inner_microstep: 390304.57 | backward_inner: 390298.59 | backward_allreduce_microstep: 7.29 | backward_allreduce: 2.49 | reduce_tied_grads: 0.28 | comms: 17.94 | reduce_grads: 0.18 | step: 350.60 | _step_clipping: 0.14 | _step_step: 348.86 | _step_zero_grad: 0.47 | _step_check_overflow: 0.59 samples/sec: 16.420 | iteration 5450/ 143000 | elapsed time per iteration (ms): 62361.4 | learning rate: 5.988E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.758309E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 22:32:01,476] [INFO] [logging.py:60:log_dist] [Rank 0] step=5460, skipped=0, lr=[0.0005988249798469686, 0.0005988249798469686], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5460 loss: 2.7705 iter time (s): 62.281 samples/sec: 16.442 %comms: 0.002890760143731761 %optimizer_step 0.05626738253812138 %forward: 23.333011786000892 %backward: 62.668427093171964 [2025-03-28 22:32:01,477] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17284.84 | forward: 145320.31 | backward_microstep: 390312.46 | backward: 390305.17 | backward_inner_microstep: 390290.02 | backward_inner: 390284.20 | backward_allreduce_microstep: 7.22 | backward_allreduce: 2.48 | reduce_tied_grads: 0.27 | comms: 18.00 | reduce_grads: 0.18 | step: 350.44 | _step_clipping: 0.12 | _step_step: 348.66 | _step_zero_grad: 0.47 | _step_check_overflow: 0.63 samples/sec: 16.441 | iteration 5460/ 143000 | elapsed time per iteration (ms): 62281.5 | learning rate: 5.988E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.770070E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 22:42:24,611] [INFO] [logging.py:60:log_dist] [Rank 0] step=5470, skipped=0, lr=[0.0005988191450798114, 0.0005988191450798114], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5470 loss: 2.7675 iter time (s): 62.313 samples/sec: 16.433 %comms: 0.0029116250549061883 %optimizer_step 0.0565851131462191 %forward: 23.3319202857054 %backward: 62.65078888936628 [2025-03-28 22:42:24,612] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17396.18 | forward: 145387.94 | backward_microstep: 390404.22 | backward: 390395.17 | backward_inner_microstep: 390375.37 | backward_inner: 390367.41 | backward_allreduce_microstep: 7.64 | backward_allreduce: 2.74 | reduce_tied_grads: 0.37 | comms: 18.14 | reduce_grads: 0.21 | step: 352.60 | _step_clipping: 0.16 | _step_step: 350.78 | _step_zero_grad: 0.48 | _step_check_overflow: 0.58 samples/sec: 16.433 | iteration 5470/ 143000 | elapsed time per iteration (ms): 62313.5 | learning rate: 5.988E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.764470E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 22:52:44,007] [INFO] [logging.py:60:log_dist] [Rank 0] step=5480, skipped=0, lr=[0.0005988132958903023, 0.0005988132958903023], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5480 loss: 2.7642 iter time (s): 61.939 samples/sec: 16.532 %comms: 0.0028987494270243013 %optimizer_step 0.05641296573053946 %forward: 23.47417732849785 %backward: 63.05195290906784 [2025-03-28 22:52:44,008] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13532.15 | forward: 145396.90 | backward_microstep: 390548.12 | backward: 390538.01 | backward_inner_microstep: 390522.19 | backward_inner: 390515.95 | backward_allreduce_microstep: 7.43 | backward_allreduce: 2.55 | reduce_tied_grads: 0.27 | comms: 17.95 | reduce_grads: 0.19 | step: 349.42 | _step_clipping: 0.11 | _step_step: 347.75 | _step_zero_grad: 0.46 | _step_check_overflow: 0.53 samples/sec: 16.532 | iteration 5480/ 143000 | elapsed time per iteration (ms): 61939.6 | learning rate: 5.988E-04 | approx flops per GPU: 71.3TFLOPS | lm_loss: 2.772106E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 23:03:03,257] [INFO] [logging.py:60:log_dist] [Rank 0] step=5490, skipped=0, lr=[0.0005988074322787236, 0.0005988074322787236], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5490 loss: 2.7743 iter time (s): 61.924 samples/sec: 16.536 %comms: 0.002921074680488894 %optimizer_step 0.05779534277704264 %forward: 23.481266438759825 %backward: 63.041902494226086 [2025-03-28 23:03:03,257] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13523.00 | forward: 145406.33 | backward_microstep: 390392.00 | backward: 390383.19 | backward_inner_microstep: 390367.67 | backward_inner: 390361.64 | backward_allreduce_microstep: 7.35 | backward_allreduce: 2.51 | reduce_tied_grads: 0.29 | comms: 18.09 | reduce_grads: 0.18 | step: 357.89 | _step_clipping: 0.12 | _step_step: 356.14 | _step_zero_grad: 0.51 | _step_check_overflow: 0.56 samples/sec: 16.536 | iteration 5490/ 143000 | elapsed time per iteration (ms): 61925.0 | learning rate: 5.988E-04 | approx flops per GPU: 71.3TFLOPS | lm_loss: 2.776254E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 23:13:30,315] [INFO] [logging.py:60:log_dist] [Rank 0] step=5500, skipped=0, lr=[0.0005988015542453583, 0.0005988015542453583], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5500 loss: 2.7702 iter time (s): 62.705 samples/sec: 16.330 %comms: 0.0028587300649736804 %optimizer_step 0.05637579487360933 %forward: 23.18968939584922 %backward: 62.25005744645936 [2025-03-28 23:13:30,315] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21383.90 | forward: 145411.53 | backward_microstep: 390348.55 | backward: 390340.56 | backward_inner_microstep: 390325.15 | backward_inner: 390319.21 | backward_allreduce_microstep: 7.36 | backward_allreduce: 2.52 | reduce_tied_grads: 0.28 | comms: 17.93 | reduce_grads: 0.18 | step: 353.51 | _step_clipping: 0.12 | _step_step: 351.78 | _step_zero_grad: 0.46 | _step_check_overflow: 0.57 samples/sec: 16.330 | iteration 5500/ 143000 | elapsed time per iteration (ms): 62705.8 | learning rate: 5.988E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.770347E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 23:23:53,033] [INFO] [logging.py:60:log_dist] [Rank 0] step=5510, skipped=0, lr=[0.0005987956617904902, 0.0005987956617904902], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5510 loss: 2.7617 iter time (s): 62.271 samples/sec: 16.444 %comms: 0.0028732173329253053 %optimizer_step 0.05606342157173386 %forward: 23.327898094961768 %backward: 62.666924478743205 [2025-03-28 23:23:53,033] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17282.42 | forward: 145265.74 | backward_microstep: 390241.50 | backward: 390234.78 | backward_inner_microstep: 390216.36 | backward_inner: 390208.89 | backward_allreduce_microstep: 7.19 | backward_allreduce: 2.48 | reduce_tied_grads: 0.28 | comms: 17.89 | reduce_grads: 0.18 | step: 349.11 | _step_clipping: 0.14 | _step_step: 347.44 | _step_zero_grad: 0.46 | _step_check_overflow: 0.54 samples/sec: 16.444 | iteration 5510/ 143000 | elapsed time per iteration (ms): 62271.8 | learning rate: 5.988E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.762017E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 23:34:17,088] [INFO] [logging.py:60:log_dist] [Rank 0] step=5520, skipped=0, lr=[0.0005987897549144036, 0.0005987897549144036], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5520 loss: 2.7733 iter time (s): 62.405 samples/sec: 16.409 %comms: 0.0028804678786669433 %optimizer_step 0.05692743607766507 %forward: 23.298809902231206 %backward: 62.53914938661461 [2025-03-28 23:34:17,089] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18463.48 | forward: 145396.30 | backward_microstep: 390283.76 | backward: 390275.76 | backward_inner_microstep: 390259.98 | backward_inner: 390252.24 | backward_allreduce_microstep: 7.60 | backward_allreduce: 2.55 | reduce_tied_grads: 0.31 | comms: 17.98 | reduce_grads: 0.19 | step: 355.26 | _step_clipping: 0.13 | _step_step: 353.54 | _step_zero_grad: 0.47 | _step_check_overflow: 0.56 samples/sec: 16.409 | iteration 5520/ 143000 | elapsed time per iteration (ms): 62405.6 | learning rate: 5.988E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.761827E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 23:44:40,314] [INFO] [logging.py:60:log_dist] [Rank 0] step=5530, skipped=0, lr=[0.0005987838336173836, 0.0005987838336173836], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5530 loss: 2.7744 iter time (s): 62.322 samples/sec: 16.431 %comms: 0.002879598129540646 %optimizer_step 0.05662527424540743 %forward: 23.33142643355114 %backward: 62.63181548178803 [2025-03-28 23:44:40,315] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17565.72 | forward: 145406.21 | backward_microstep: 390342.77 | backward: 390334.26 | backward_inner_microstep: 390318.80 | backward_inner: 390312.81 | backward_allreduce_microstep: 7.35 | backward_allreduce: 2.52 | reduce_tied_grads: 0.32 | comms: 17.95 | reduce_grads: 0.18 | step: 352.90 | _step_clipping: 0.14 | _step_step: 351.12 | _step_zero_grad: 0.49 | _step_check_overflow: 0.58 samples/sec: 16.431 | iteration 5530/ 143000 | elapsed time per iteration (ms): 62322.6 | learning rate: 5.988E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.756711E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-28 23:55:09,912] [INFO] [logging.py:60:log_dist] [Rank 0] step=5540, skipped=0, lr=[0.0005987778978997159, 0.0005987778978997159], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5540 loss: 2.7519 iter time (s): 62.959 samples/sec: 16.264 %comms: 0.002864692271458214 %optimizer_step 0.05549174919642642 %forward: 23.093471205636405 %backward: 61.98973955977476 [2025-03-28 23:55:09,913] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23972.93 | forward: 145394.77 | backward_microstep: 390291.01 | backward: 390282.77 | backward_inner_microstep: 390266.76 | backward_inner: 390260.68 | backward_allreduce_microstep: 7.44 | backward_allreduce: 2.56 | reduce_tied_grads: 0.31 | comms: 18.04 | reduce_grads: 0.18 | step: 349.37 | _step_clipping: 0.11 | _step_step: 347.68 | _step_zero_grad: 0.47 | _step_check_overflow: 0.56 samples/sec: 16.264 | iteration 5540/ 143000 | elapsed time per iteration (ms): 62959.9 | learning rate: 5.988E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.757325E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 00:05:32,688] [INFO] [logging.py:60:log_dist] [Rank 0] step=5550, skipped=0, lr=[0.0005987719477616873, 0.0005987719477616873], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5550 loss: 2.7593 iter time (s): 62.277 samples/sec: 16.443 %comms: 0.002892786550413384 %optimizer_step 0.05715008740834683 %forward: 23.33862177071741 %backward: 62.67596149818938 [2025-03-29 00:05:32,689] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17152.68 | forward: 145345.76 | backward_microstep: 390335.30 | backward: 390326.61 | backward_inner_microstep: 390310.95 | backward_inner: 390304.91 | backward_allreduce_microstep: 7.33 | backward_allreduce: 2.52 | reduce_tied_grads: 0.34 | comms: 18.02 | reduce_grads: 0.20 | step: 355.91 | _step_clipping: 0.14 | _step_step: 354.10 | _step_zero_grad: 0.49 | _step_check_overflow: 0.61 samples/sec: 16.443 | iteration 5550/ 143000 | elapsed time per iteration (ms): 62277.5 | learning rate: 5.988E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.763566E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 00:15:59,378] [INFO] [logging.py:60:log_dist] [Rank 0] step=5560, skipped=0, lr=[0.0005987659832035846, 0.0005987659832035846], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5560 loss: 2.7435 iter time (s): 62.668 samples/sec: 16.340 %comms: 0.0028430979585538207 %optimizer_step 0.05485580052807071 %forward: 23.183757255169425 %backward: 62.26076837333129 [2025-03-29 00:15:59,379] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21349.69 | forward: 145289.05 | backward_microstep: 390185.06 | backward: 390178.69 | backward_inner_microstep: 390163.94 | backward_inner: 390158.33 | backward_allreduce_microstep: 7.11 | backward_allreduce: 2.43 | reduce_tied_grads: 0.24 | comms: 17.82 | reduce_grads: 0.18 | step: 343.77 | _step_clipping: 0.11 | _step_step: 342.15 | _step_zero_grad: 0.46 | _step_check_overflow: 0.53 samples/sec: 16.340 | iteration 5560/ 143000 | elapsed time per iteration (ms): 62669.0 | learning rate: 5.988E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.759292E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 00:26:23,042] [INFO] [logging.py:60:log_dist] [Rank 0] step=5570, skipped=0, lr=[0.0005987600042256959, 0.0005987600042256959], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5570 loss: 2.7572 iter time (s): 62.366 samples/sec: 16.419 %comms: 0.0028689339253398583 %optimizer_step 0.05598500189269129 %forward: 23.290547043309825 %backward: 62.57775046619488 [2025-03-29 00:26:23,043] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18230.42 | forward: 145253.56 | backward_microstep: 390278.62 | backward: 390271.69 | backward_inner_microstep: 390256.57 | backward_inner: 390249.21 | backward_allreduce_microstep: 7.16 | backward_allreduce: 2.46 | reduce_tied_grads: 0.28 | comms: 17.89 | reduce_grads: 0.18 | step: 349.16 | _step_clipping: 0.12 | _step_step: 347.53 | _step_zero_grad: 0.46 | _step_check_overflow: 0.52 samples/sec: 16.419 | iteration 5570/ 143000 | elapsed time per iteration (ms): 62366.4 | learning rate: 5.988E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.749821E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 00:36:48,622] [INFO] [logging.py:60:log_dist] [Rank 0] step=5580, skipped=0, lr=[0.0005987540108283098, 0.0005987540108283098], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5580 loss: 2.7555 iter time (s): 62.557 samples/sec: 16.369 %comms: 0.002858590423791139 %optimizer_step 0.055551819881013556 %forward: 23.22883897339398 %backward: 62.38619095622688 [2025-03-29 00:36:48,622] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20090.51 | forward: 145313.47 | backward_microstep: 390278.09 | backward: 390271.51 | backward_inner_microstep: 390254.48 | backward_inner: 390248.82 | backward_allreduce_microstep: 9.28 | backward_allreduce: 2.56 | reduce_tied_grads: 0.29 | comms: 17.88 | reduce_grads: 0.18 | step: 347.52 | _step_clipping: 0.13 | _step_step: 345.85 | _step_zero_grad: 0.48 | _step_check_overflow: 0.52 samples/sec: 16.369 | iteration 5580/ 143000 | elapsed time per iteration (ms): 62557.9 | learning rate: 5.988E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.754780E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 00:47:11,799] [INFO] [logging.py:60:log_dist] [Rank 0] step=5590, skipped=0, lr=[0.0005987480030117153, 0.0005987480030117153], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5590 loss: 2.7651 iter time (s): 62.317 samples/sec: 16.432 %comms: 0.002879400206945627 %optimizer_step 0.05636222846186426 %forward: 23.33092153931325 %backward: 62.648058395373255 [2025-03-29 00:47:11,800] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17430.74 | forward: 145391.81 | backward_microstep: 390414.65 | backward: 390405.26 | backward_inner_microstep: 390389.61 | backward_inner: 390383.65 | backward_allreduce_microstep: 7.29 | backward_allreduce: 2.53 | reduce_tied_grads: 0.30 | comms: 17.94 | reduce_grads: 0.18 | step: 351.23 | _step_clipping: 0.15 | _step_step: 349.54 | _step_zero_grad: 0.45 | _step_check_overflow: 0.55 samples/sec: 16.432 | iteration 5590/ 143000 | elapsed time per iteration (ms): 62317.8 | learning rate: 5.987E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.762296E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 00:57:35,865] [INFO] [logging.py:60:log_dist] [Rank 0] step=5600, skipped=0, lr=[0.0005987419807762027, 0.0005987419807762027], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5600 loss: 2.7487 iter time (s): 62.406 samples/sec: 16.409 %comms: 0.0029437254760627724 %optimizer_step 0.057479284810854514 %forward: 23.301367850149436 %backward: 62.54976521938325 [2025-03-29 00:57:35,866] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18355.96 | forward: 145414.64 | backward_microstep: 390356.62 | backward: 390348.39 | backward_inner_microstep: 390332.93 | backward_inner: 390326.71 | backward_allreduce_microstep: 7.36 | backward_allreduce: 2.53 | reduce_tied_grads: 0.32 | comms: 18.37 | reduce_grads: 0.18 | step: 358.71 | _step_clipping: 0.13 | _step_step: 356.96 | _step_zero_grad: 0.48 | _step_check_overflow: 0.54 samples/sec: 16.409 | iteration 5600/ 143000 | elapsed time per iteration (ms): 62406.6 | learning rate: 5.987E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.755911E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 01:07:59,367] [INFO] [logging.py:60:log_dist] [Rank 0] step=5610, skipped=0, lr=[0.0005987359441220624, 0.0005987359441220624], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5610 loss: 2.7531 iter time (s): 62.350 samples/sec: 16.424 %comms: 0.0028796247573849893 %optimizer_step 0.05653426956758835 %forward: 23.301858454012446 %backward: 62.59886033403084 [2025-03-29 01:07:59,368] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17989.30 | forward: 145286.19 | backward_microstep: 390309.92 | backward: 390301.48 | backward_inner_microstep: 390286.00 | backward_inner: 390280.07 | backward_allreduce_microstep: 7.38 | backward_allreduce: 2.53 | reduce_tied_grads: 0.31 | comms: 17.95 | reduce_grads: 0.18 | step: 352.49 | _step_clipping: 0.13 | _step_step: 350.76 | _step_zero_grad: 0.46 | _step_check_overflow: 0.57 samples/sec: 16.423 | iteration 5610/ 143000 | elapsed time per iteration (ms): 62350.2 | learning rate: 5.987E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.749639E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 01:18:26,788] [INFO] [logging.py:60:log_dist] [Rank 0] step=5620, skipped=0, lr=[0.000598729893049586, 0.000598729893049586], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5620 loss: 2.7449 iter time (s): 62.742 samples/sec: 16.321 %comms: 0.002849929859284795 %optimizer_step 0.055186791460732035 %forward: 23.1542741037983 %backward: 62.18418375895076 [2025-03-29 01:18:26,789] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22105.43 | forward: 145273.63 | backward_microstep: 390162.35 | backward: 390153.53 | backward_inner_microstep: 390138.62 | backward_inner: 390132.93 | backward_allreduce_microstep: 7.14 | backward_allreduce: 2.46 | reduce_tied_grads: 0.28 | comms: 17.88 | reduce_grads: 0.18 | step: 346.25 | _step_clipping: 0.12 | _step_step: 344.56 | _step_zero_grad: 0.47 | _step_check_overflow: 0.56 samples/sec: 16.321 | iteration 5620/ 143000 | elapsed time per iteration (ms): 62742.1 | learning rate: 5.987E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.757577E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 01:28:50,311] [INFO] [logging.py:60:log_dist] [Rank 0] step=5630, skipped=0, lr=[0.0005987238275590652, 0.0005987238275590652], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5630 loss: 2.7464 iter time (s): 62.352 samples/sec: 16.423 %comms: 0.00287184313570572 %optimizer_step 0.05610856117828928 %forward: 23.31511682885199 %backward: 62.6035787029009 [2025-03-29 01:28:50,311] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17912.75 | forward: 145373.69 | backward_microstep: 390352.54 | backward: 390343.90 | backward_inner_microstep: 390328.48 | backward_inner: 390322.56 | backward_allreduce_microstep: 7.34 | backward_allreduce: 2.50 | reduce_tied_grads: 0.30 | comms: 17.91 | reduce_grads: 0.18 | step: 349.85 | _step_clipping: 0.14 | _step_step: 348.14 | _step_zero_grad: 0.46 | _step_check_overflow: 0.57 samples/sec: 16.423 | iteration 5630/ 143000 | elapsed time per iteration (ms): 62352.2 | learning rate: 5.987E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.752556E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 01:39:18,413] [INFO] [logging.py:60:log_dist] [Rank 0] step=5640, skipped=0, lr=[0.0005987177476507932, 0.0005987177476507932], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5640 loss: 2.7664 iter time (s): 62.810 samples/sec: 16.303 %comms: 0.002845323166305364 %optimizer_step 0.055346390527847136 %forward: 23.134469045503682 %backward: 62.13121245474604 [2025-03-29 01:39:18,413] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22694.76 | forward: 145306.83 | backward_microstep: 390251.11 | backward: 390244.07 | backward_inner_microstep: 390226.84 | backward_inner: 390221.06 | backward_allreduce_microstep: 7.27 | backward_allreduce: 2.49 | reduce_tied_grads: 0.28 | comms: 17.87 | reduce_grads: 0.19 | step: 347.63 | _step_clipping: 0.12 | _step_step: 345.96 | _step_zero_grad: 0.47 | _step_check_overflow: 0.53 samples/sec: 16.303 | iteration 5640/ 143000 | elapsed time per iteration (ms): 62810.2 | learning rate: 5.987E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.762811E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 01:49:41,992] [INFO] [logging.py:60:log_dist] [Rank 0] step=5650, skipped=0, lr=[0.000598711653325063, 0.000598711653325063], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5650 loss: 2.7536 iter time (s): 62.357 samples/sec: 16.421 %comms: 0.0028693240757293533 %optimizer_step 0.055938284568495325 %forward: 23.3054979865482 %backward: 62.57492264698627 [2025-03-29 01:49:41,993] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18173.88 | forward: 145327.04 | backward_microstep: 390207.88 | backward: 390200.99 | backward_inner_microstep: 390185.98 | backward_inner: 390180.35 | backward_allreduce_microstep: 7.21 | backward_allreduce: 2.47 | reduce_tied_grads: 0.28 | comms: 17.89 | reduce_grads: 0.17 | step: 348.82 | _step_clipping: 0.12 | _step_step: 347.14 | _step_zero_grad: 0.46 | _step_check_overflow: 0.56 samples/sec: 16.421 | iteration 5650/ 143000 | elapsed time per iteration (ms): 62357.9 | learning rate: 5.987E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.763608E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 02:00:01,571] [INFO] [logging.py:60:log_dist] [Rank 0] step=5660, skipped=0, lr=[0.0005987055445821691, 0.0005987055445821691], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5660 loss: 2.7496 iter time (s): 61.957 samples/sec: 16.528 %comms: 0.0028764612412042006 %optimizer_step 0.05638133400368105 %forward: 23.45285442564328 %backward: 62.979644250440906 [2025-03-29 02:00:01,571] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14220.64 | forward: 145307.65 | backward_microstep: 390213.40 | backward: 390205.12 | backward_inner_microstep: 390190.18 | backward_inner: 390184.76 | backward_allreduce_microstep: 7.31 | backward_allreduce: 2.63 | reduce_tied_grads: 0.24 | comms: 17.82 | reduce_grads: 0.17 | step: 349.32 | _step_clipping: 0.10 | _step_step: 347.70 | _step_zero_grad: 0.46 | _step_check_overflow: 0.53 samples/sec: 16.527 | iteration 5660/ 143000 | elapsed time per iteration (ms): 61957.8 | learning rate: 5.987E-04 | approx flops per GPU: 71.3TFLOPS | lm_loss: 2.755313E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 02:10:25,282] [INFO] [logging.py:60:log_dist] [Rank 0] step=5670, skipped=0, lr=[0.0005986994214224061, 0.0005986994214224061], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5670 loss: 2.7467 iter time (s): 62.371 samples/sec: 16.418 %comms: 0.0028996053948371287 %optimizer_step 0.05792876718775893 %forward: 23.291545966603845 %backward: 62.55351007900048 [2025-03-29 02:10:25,282] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18436.95 | forward: 145270.69 | backward_microstep: 390156.63 | backward: 390149.78 | backward_inner_microstep: 390134.75 | backward_inner: 390129.07 | backward_allreduce_microstep: 7.23 | backward_allreduce: 2.48 | reduce_tied_grads: 0.28 | comms: 18.09 | reduce_grads: 0.18 | step: 361.30 | _step_clipping: 0.14 | _step_step: 359.55 | _step_zero_grad: 0.47 | _step_check_overflow: 0.59 samples/sec: 16.418 | iteration 5670/ 143000 | elapsed time per iteration (ms): 62371.1 | learning rate: 5.987E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.743833E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 02:20:48,651] [INFO] [logging.py:60:log_dist] [Rank 0] step=5680, skipped=0, lr=[0.0005986932838460697, 0.0005986932838460697], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5680 loss: 2.7572 iter time (s): 62.336 samples/sec: 16.427 %comms: 0.0028819585706455434 %optimizer_step 0.055852266070970366 %forward: 23.302561321508293 %backward: 62.59096547237619 [2025-03-29 02:20:48,651] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18106.41 | forward: 145259.67 | backward_microstep: 390175.90 | backward: 390169.24 | backward_inner_microstep: 390154.19 | backward_inner: 390148.51 | backward_allreduce_microstep: 7.26 | backward_allreduce: 2.49 | reduce_tied_grads: 0.27 | comms: 17.97 | reduce_grads: 0.18 | step: 348.16 | _step_clipping: 0.14 | _step_step: 346.48 | _step_zero_grad: 0.47 | _step_check_overflow: 0.53 samples/sec: 16.427 | iteration 5680/ 143000 | elapsed time per iteration (ms): 62336.9 | learning rate: 5.987E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.755005E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 02:31:15,556] [INFO] [logging.py:60:log_dist] [Rank 0] step=5690, skipped=0, lr=[0.000598687131853456, 0.000598687131853456], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5690 loss: 2.7475 iter time (s): 62.690 samples/sec: 16.334 %comms: 0.0028521986650855303 %optimizer_step 0.05564951604727002 %forward: 23.173750041733694 %backward: 62.237912299072974 [2025-03-29 02:31:15,557] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21630.87 | forward: 145276.29 | backward_microstep: 390176.06 | backward: 390169.61 | backward_inner_microstep: 390154.62 | backward_inner: 390149.06 | backward_allreduce_microstep: 7.13 | backward_allreduce: 2.45 | reduce_tied_grads: 0.25 | comms: 17.88 | reduce_grads: 0.19 | step: 348.87 | _step_clipping: 0.14 | _step_step: 346.95 | _step_zero_grad: 0.48 | _step_check_overflow: 0.55 samples/sec: 16.334 | iteration 5690/ 143000 | elapsed time per iteration (ms): 62690.6 | learning rate: 5.987E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.754621E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 02:41:38,491] [INFO] [logging.py:60:log_dist] [Rank 0] step=5700, skipped=0, lr=[0.0005986809654448618, 0.0005986809654448618], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5700 loss: 2.7581 iter time (s): 62.293 samples/sec: 16.438 %comms: 0.002885499686666601 %optimizer_step 0.05674346558557709 %forward: 23.329616243590927 %backward: 62.66601349526514 [2025-03-29 02:41:38,491] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17374.17 | forward: 145326.95 | backward_microstep: 390373.77 | backward: 390364.79 | backward_inner_microstep: 390349.45 | backward_inner: 390343.29 | backward_allreduce_microstep: 7.21 | backward_allreduce: 2.49 | reduce_tied_grads: 0.32 | comms: 17.97 | reduce_grads: 0.18 | step: 353.47 | _step_clipping: 0.13 | _step_step: 351.69 | _step_zero_grad: 0.50 | _step_check_overflow: 0.58 samples/sec: 16.438 | iteration 5700/ 143000 | elapsed time per iteration (ms): 62293.5 | learning rate: 5.987E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.755102E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 02:51:57,507] [INFO] [logging.py:60:log_dist] [Rank 0] step=5710, skipped=0, lr=[0.0005986747846205851, 0.0005986747846205851], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5710 loss: 2.7251 iter time (s): 61.901 samples/sec: 16.543 %comms: 0.0028860448890432678 %optimizer_step 0.05655078069008671 %forward: 23.473317274399363 %backward: 63.03232782820961 [2025-03-29 02:51:57,508] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13674.75 | forward: 145302.47 | backward_microstep: 390183.65 | backward: 390177.20 | backward_inner_microstep: 390160.63 | backward_inner: 390155.09 | backward_allreduce_microstep: 8.89 | backward_allreduce: 2.50 | reduce_tied_grads: 0.26 | comms: 17.86 | reduce_grads: 0.18 | step: 350.06 | _step_clipping: 0.12 | _step_step: 348.41 | _step_zero_grad: 0.46 | _step_check_overflow: 0.52 samples/sec: 16.542 | iteration 5710/ 143000 | elapsed time per iteration (ms): 61901.6 | learning rate: 5.987E-04 | approx flops per GPU: 71.4TFLOPS | lm_loss: 2.744773E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 03:02:20,641] [INFO] [logging.py:60:log_dist] [Rank 0] step=5720, skipped=0, lr=[0.000598668589380924, 0.000598668589380924], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5720 loss: 2.7433 iter time (s): 62.313 samples/sec: 16.433 %comms: 0.002872026157677939 %optimizer_step 0.05571071882134957 %forward: 23.316860458840942 %backward: 62.62447102451518 [2025-03-29 03:02:20,642] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17755.66 | forward: 145294.01 | backward_microstep: 390237.86 | backward: 390230.93 | backward_inner_microstep: 390212.15 | backward_inner: 390206.32 | backward_allreduce_microstep: 7.26 | backward_allreduce: 2.50 | reduce_tied_grads: 0.27 | comms: 17.90 | reduce_grads: 0.18 | step: 347.15 | _step_clipping: 0.12 | _step_step: 345.46 | _step_zero_grad: 0.48 | _step_check_overflow: 0.55 samples/sec: 16.433 | iteration 5720/ 143000 | elapsed time per iteration (ms): 62313.4 | learning rate: 5.987E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.738492E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 03:12:43,130] [INFO] [logging.py:60:log_dist] [Rank 0] step=5730, skipped=0, lr=[0.0005986623797261774, 0.0005986623797261774], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5730 loss: 2.7406 iter time (s): 62.248 samples/sec: 16.450 %comms: 0.0029261739417343465 %optimizer_step 0.05530652212661209 %forward: 23.335267437062054 %backward: 62.678954807789545 [2025-03-29 03:12:43,130] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17247.96 | forward: 145258.12 | backward_microstep: 390171.96 | backward: 390165.96 | backward_inner_microstep: 390151.38 | backward_inner: 390145.92 | backward_allreduce_microstep: 7.07 | backward_allreduce: 2.44 | reduce_tied_grads: 0.25 | comms: 18.21 | reduce_grads: 0.18 | step: 344.27 | _step_clipping: 0.12 | _step_step: 342.63 | _step_zero_grad: 0.47 | _step_check_overflow: 0.51 samples/sec: 16.450 | iteration 5730/ 143000 | elapsed time per iteration (ms): 62248.8 | learning rate: 5.987E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.753576E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 03:23:11,860] [INFO] [logging.py:60:log_dist] [Rank 0] step=5740, skipped=0, lr=[0.0005986561556566452, 0.0005986561556566452], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5740 loss: 2.7423 iter time (s): 62.873 samples/sec: 16.287 %comms: 0.0031214641154492543 %optimizer_step 0.05840006616783426 %forward: 23.108356446195383 %backward: 62.05474237459981 [2025-03-29 03:23:11,860] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23456.52 | forward: 145288.01 | backward_microstep: 390159.69 | backward: 390153.68 | backward_inner_microstep: 390139.01 | backward_inner: 390133.59 | backward_allreduce_microstep: 7.09 | backward_allreduce: 2.44 | reduce_tied_grads: 0.24 | comms: 19.63 | reduce_grads: 1.97 | step: 367.18 | _step_clipping: 0.13 | _step_step: 365.55 | _step_zero_grad: 0.48 | _step_check_overflow: 0.48 samples/sec: 16.287 | iteration 5740/ 143000 | elapsed time per iteration (ms): 62873.0 | learning rate: 5.987E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.752509E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 03:33:34,246] [INFO] [logging.py:60:log_dist] [Rank 0] step=5750, skipped=0, lr=[0.0005986499171726276, 0.0005986499171726276], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5750 loss: 2.7312 iter time (s): 62.238 samples/sec: 16.453 %comms: 0.0028672381791874394 %optimizer_step 0.055241912838126794 %forward: 23.337479981292052 %backward: 62.694081264783144 [2025-03-29 03:33:34,247] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17143.92 | forward: 145248.11 | backward_microstep: 390202.29 | backward: 390196.23 | backward_inner_microstep: 390179.78 | backward_inner: 390174.27 | backward_allreduce_microstep: 7.13 | backward_allreduce: 2.45 | reduce_tied_grads: 0.25 | comms: 17.85 | reduce_grads: 0.18 | step: 343.82 | _step_clipping: 0.11 | _step_step: 342.21 | _step_zero_grad: 0.47 | _step_check_overflow: 0.50 samples/sec: 16.453 | iteration 5750/ 143000 | elapsed time per iteration (ms): 62238.7 | learning rate: 5.986E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.745564E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 03:44:01,448] [INFO] [logging.py:60:log_dist] [Rank 0] step=5760, skipped=0, lr=[0.0005986436642744259, 0.0005986436642744259], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5760 loss: 2.7363 iter time (s): 62.720 samples/sec: 16.327 %comms: 0.002858795667711747 %optimizer_step 0.056316659085596465 %forward: 23.17410419900995 %backward: 62.22386142641736 [2025-03-29 03:44:01,449] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21760.16 | forward: 145347.19 | backward_microstep: 390273.49 | backward: 390265.94 | backward_inner_microstep: 390249.08 | backward_inner: 390243.16 | backward_allreduce_microstep: 7.32 | backward_allreduce: 2.55 | reduce_tied_grads: 0.30 | comms: 17.93 | reduce_grads: 0.18 | step: 353.22 | _step_clipping: 0.15 | _step_step: 351.47 | _step_zero_grad: 0.46 | _step_check_overflow: 0.59 samples/sec: 16.326 | iteration 5760/ 143000 | elapsed time per iteration (ms): 62720.2 | learning rate: 5.986E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.735270E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 03:54:24,536] [INFO] [logging.py:60:log_dist] [Rank 0] step=5770, skipped=0, lr=[0.0005986373969623416, 0.0005986373969623416], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5770 loss: 2.7504 iter time (s): 62.308 samples/sec: 16.434 %comms: 0.0029343419903637433 %optimizer_step 0.0585649295031607 %forward: 23.344293695933498 %backward: 62.6660143268997 [2025-03-29 03:54:24,537] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17330.09 | forward: 145454.17 | backward_microstep: 390470.56 | backward: 390460.88 | backward_inner_microstep: 390445.42 | backward_inner: 390439.45 | backward_allreduce_microstep: 7.28 | backward_allreduce: 2.51 | reduce_tied_grads: 0.35 | comms: 18.28 | reduce_grads: 0.19 | step: 364.91 | _step_clipping: 0.13 | _step_step: 363.12 | _step_zero_grad: 0.49 | _step_check_overflow: 0.59 samples/sec: 16.434 | iteration 5770/ 143000 | elapsed time per iteration (ms): 62308.8 | learning rate: 5.986E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.746807E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 04:04:43,511] [INFO] [logging.py:60:log_dist] [Rank 0] step=5780, skipped=0, lr=[0.0005986311152366776, 0.0005986311152366776], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5780 loss: 2.7652 iter time (s): 61.897 samples/sec: 16.544 %comms: 0.0028874318118198047 %optimizer_step 0.05699698414098974 %forward: 23.478308297358446 %backward: 63.03716077055776 [2025-03-29 04:04:43,512] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13684.48 | forward: 145323.66 | backward_microstep: 390188.67 | backward: 390181.05 | backward_inner_microstep: 390165.92 | backward_inner: 390160.14 | backward_allreduce_microstep: 7.25 | backward_allreduce: 2.50 | reduce_tied_grads: 0.26 | comms: 17.87 | reduce_grads: 0.18 | step: 352.79 | _step_clipping: 0.12 | _step_step: 351.12 | _step_zero_grad: 0.46 | _step_check_overflow: 0.55 samples/sec: 16.543 | iteration 5780/ 143000 | elapsed time per iteration (ms): 61897.5 | learning rate: 5.986E-04 | approx flops per GPU: 71.4TFLOPS | lm_loss: 2.749733E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 04:15:05,870] [INFO] [logging.py:60:log_dist] [Rank 0] step=5790, skipped=0, lr=[0.0005986248190977369, 0.0005986248190977369], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5790 loss: 2.7453 iter time (s): 62.235 samples/sec: 16.454 %comms: 0.0028966730811121285 %optimizer_step 0.05645258142143754 %forward: 23.346093221736567 %backward: 62.70212855887872 [2025-03-29 04:15:05,871] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17065.84 | forward: 145295.21 | backward_microstep: 390236.62 | backward: 390228.84 | backward_inner_microstep: 390213.48 | backward_inner: 390207.57 | backward_allreduce_microstep: 7.39 | backward_allreduce: 2.55 | reduce_tied_grads: 0.29 | comms: 18.03 | reduce_grads: 0.18 | step: 351.33 | _step_clipping: 0.13 | _step_step: 349.52 | _step_zero_grad: 0.48 | _step_check_overflow: 0.65 samples/sec: 16.454 | iteration 5790/ 143000 | elapsed time per iteration (ms): 62235.9 | learning rate: 5.986E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.748354E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 04:25:32,814] [INFO] [logging.py:60:log_dist] [Rank 0] step=5800, skipped=0, lr=[0.0005986185085458233, 0.0005986185085458233], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5800 loss: 2.7613 iter time (s): 62.694 samples/sec: 16.333 %comms: 0.0028838967152177973 %optimizer_step 0.056818971634489866 %forward: 23.176201768816835 %backward: 62.25476711323369 [2025-03-29 04:25:32,814] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21575.04 | forward: 145300.34 | backward_microstep: 390306.27 | backward: 390298.58 | backward_inner_microstep: 390283.50 | backward_inner: 390277.49 | backward_allreduce_microstep: 7.18 | backward_allreduce: 2.48 | reduce_tied_grads: 0.29 | comms: 18.08 | reduce_grads: 0.19 | step: 356.22 | _step_clipping: 0.15 | _step_step: 354.47 | _step_zero_grad: 0.47 | _step_check_overflow: 0.56 samples/sec: 16.333 | iteration 5800/ 143000 | elapsed time per iteration (ms): 62694.3 | learning rate: 5.986E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.742916E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 04:35:50,322] [INFO] [logging.py:60:log_dist] [Rank 0] step=5810, skipped=0, lr=[0.0005986121835812413, 0.0005986121835812413], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5810 loss: 2.7318 iter time (s): 61.750 samples/sec: 16.583 %comms: 0.0028778419212621854 %optimizer_step 0.056468960433756606 %forward: 23.506645477576114 %backward: 63.18036072889023 [2025-03-29 04:35:50,323] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 12483.39 | forward: 145154.30 | backward_microstep: 390146.20 | backward: 390140.78 | backward_inner_microstep: 390126.53 | backward_inner: 390121.15 | backward_allreduce_microstep: 6.95 | backward_allreduce: 2.37 | reduce_tied_grads: 0.23 | comms: 17.77 | reduce_grads: 0.17 | step: 348.70 | _step_clipping: 0.11 | _step_step: 347.06 | _step_zero_grad: 0.46 | _step_check_overflow: 0.56 samples/sec: 16.583 | iteration 5810/ 143000 | elapsed time per iteration (ms): 61750.8 | learning rate: 5.986E-04 | approx flops per GPU: 71.5TFLOPS | lm_loss: 2.743913E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 04:46:16,963] [INFO] [logging.py:60:log_dist] [Rank 0] step=5820, skipped=0, lr=[0.0005986058442042965, 0.0005986058442042965], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5820 loss: 2.7477 iter time (s): 62.664 samples/sec: 16.341 %comms: 0.002856410500721558 %optimizer_step 0.0559605345130373 %forward: 23.186652813233636 %backward: 62.28140421199569 [2025-03-29 04:46:16,963] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21305.67 | forward: 145295.74 | backward_microstep: 390284.08 | backward: 390277.23 | backward_inner_microstep: 390261.85 | backward_inner: 390254.29 | backward_allreduce_microstep: 7.29 | backward_allreduce: 2.50 | reduce_tied_grads: 0.27 | comms: 17.90 | reduce_grads: 0.18 | step: 350.67 | _step_clipping: 0.13 | _step_step: 348.98 | _step_zero_grad: 0.47 | _step_check_overflow: 0.56 samples/sec: 16.341 | iteration 5820/ 143000 | elapsed time per iteration (ms): 62664.1 | learning rate: 5.986E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.744392E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 04:56:44,796] [INFO] [logging.py:60:log_dist] [Rank 0] step=5830, skipped=0, lr=[0.0005985994904152945, 0.0005985994904152945], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5830 loss: 2.7472 iter time (s): 62.783 samples/sec: 16.310 %comms: 0.002856719035964134 %optimizer_step 0.05657455097919039 %forward: 23.155021726864238 %backward: 62.16649935186644 [2025-03-29 04:56:44,797] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22367.34 | forward: 145373.66 | backward_microstep: 390306.40 | backward: 390298.56 | backward_inner_microstep: 390283.09 | backward_inner: 390277.21 | backward_allreduce_microstep: 7.45 | backward_allreduce: 2.55 | reduce_tied_grads: 0.30 | comms: 17.94 | reduce_grads: 0.18 | step: 355.19 | _step_clipping: 0.13 | _step_step: 351.79 | _step_zero_grad: 0.46 | _step_check_overflow: 2.26 samples/sec: 16.310 | iteration 5830/ 143000 | elapsed time per iteration (ms): 62783.3 | learning rate: 5.986E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.735394E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 05:07:07,296] [INFO] [logging.py:60:log_dist] [Rank 0] step=5840, skipped=0, lr=[0.0005985931222145423, 0.0005985931222145423], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5840 loss: 2.7469 iter time (s): 62.249 samples/sec: 16.450 %comms: 0.0028826523066939596 %optimizer_step 0.05642589663469666 %forward: 23.355825795059733 %backward: 62.70595380936898 [2025-03-29 05:07:07,296] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16963.23 | forward: 145388.60 | backward_microstep: 390348.82 | backward: 390340.76 | backward_inner_microstep: 390323.63 | backward_inner: 390317.66 | backward_allreduce_microstep: 7.24 | backward_allreduce: 2.49 | reduce_tied_grads: 0.31 | comms: 17.94 | reduce_grads: 0.18 | step: 351.25 | _step_clipping: 0.14 | _step_step: 349.49 | _step_zero_grad: 0.46 | _step_check_overflow: 0.60 samples/sec: 16.450 | iteration 5840/ 143000 | elapsed time per iteration (ms): 62249.9 | learning rate: 5.986E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.740989E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 05:17:30,022] [INFO] [logging.py:60:log_dist] [Rank 0] step=5850, skipped=0, lr=[0.0005985867396023469, 0.0005985867396023469], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5850 loss: 2.7192 iter time (s): 62.272 samples/sec: 16.444 %comms: 0.002879265835368844 %optimizer_step 0.05610002704051121 %forward: 23.333638403372127 %backward: 62.66702139542345 [2025-03-29 05:17:30,022] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17400.82 | forward: 145303.46 | backward_microstep: 390248.14 | backward: 390240.69 | backward_inner_microstep: 390225.35 | backward_inner: 390219.54 | backward_allreduce_microstep: 7.33 | backward_allreduce: 2.53 | reduce_tied_grads: 0.27 | comms: 17.93 | reduce_grads: 0.18 | step: 349.35 | _step_clipping: 0.12 | _step_step: 347.69 | _step_zero_grad: 0.45 | _step_check_overflow: 0.54 samples/sec: 16.444 | iteration 5850/ 143000 | elapsed time per iteration (ms): 62272.6 | learning rate: 5.986E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.736852E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 05:27:53,874] [INFO] [logging.py:60:log_dist] [Rank 0] step=5860, skipped=0, lr=[0.0005985803425790167, 0.0005985803425790167], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5860 loss: 2.7470 iter time (s): 62.385 samples/sec: 16.414 %comms: 0.002877129502673897 %optimizer_step 0.056106872507326086 %forward: 23.30356296738328 %backward: 62.5612248337029 [2025-03-29 05:27:53,874] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18382.90 | forward: 145378.41 | backward_microstep: 390294.35 | backward: 390285.89 | backward_inner_microstep: 390270.37 | backward_inner: 390264.24 | backward_allreduce_microstep: 7.37 | backward_allreduce: 2.54 | reduce_tied_grads: 0.30 | comms: 17.95 | reduce_grads: 0.19 | step: 350.02 | _step_clipping: 0.12 | _step_step: 348.32 | _step_zero_grad: 0.47 | _step_check_overflow: 0.55 samples/sec: 16.414 | iteration 5860/ 143000 | elapsed time per iteration (ms): 62385.2 | learning rate: 5.986E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.736215E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 05:38:20,742] [INFO] [logging.py:60:log_dist] [Rank 0] step=5870, skipped=0, lr=[0.0005985739311448602, 0.0005985739311448602], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5870 loss: 2.7565 iter time (s): 62.686 samples/sec: 16.335 %comms: 0.0028579208355350443 %optimizer_step 0.055786925063827056 %forward: 23.183602719407634 %backward: 62.25486023531403 [2025-03-29 05:38:20,743] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21516.74 | forward: 145329.43 | backward_microstep: 390263.10 | backward: 390252.70 | backward_inner_microstep: 390237.65 | backward_inner: 390231.78 | backward_allreduce_microstep: 7.08 | backward_allreduce: 2.43 | reduce_tied_grads: 0.29 | comms: 17.92 | reduce_grads: 0.18 | step: 349.71 | _step_clipping: 0.13 | _step_step: 348.04 | _step_zero_grad: 0.48 | _step_check_overflow: 0.51 samples/sec: 16.335 | iteration 5870/ 143000 | elapsed time per iteration (ms): 62686.8 | learning rate: 5.986E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.746614E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 05:48:42,738] [INFO] [logging.py:60:log_dist] [Rank 0] step=5880, skipped=0, lr=[0.000598567505300187, 0.000598567505300187], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5880 loss: 2.7128 iter time (s): 62.199 samples/sec: 16.463 %comms: 0.002870188164787574 %optimizer_step 0.05615798863088868 %forward: 23.340223997935716 %backward: 62.709306540008114 [2025-03-29 05:48:42,739] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17086.50 | forward: 145174.05 | backward_microstep: 390051.85 | backward: 390046.12 | backward_inner_microstep: 390031.52 | backward_inner: 390026.23 | backward_allreduce_microstep: 7.07 | backward_allreduce: 2.44 | reduce_tied_grads: 0.26 | comms: 17.85 | reduce_grads: 0.17 | step: 349.30 | _step_clipping: 0.12 | _step_step: 347.61 | _step_zero_grad: 0.46 | _step_check_overflow: 0.57 samples/sec: 16.463 | iteration 5880/ 143000 | elapsed time per iteration (ms): 62199.6 | learning rate: 5.986E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.735378E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 05:59:06,853] [INFO] [logging.py:60:log_dist] [Rank 0] step=5890, skipped=0, lr=[0.0005985610650453071, 0.0005985610650453071], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5890 loss: 2.7217 iter time (s): 62.411 samples/sec: 16.407 %comms: 0.00287648775492346 %optimizer_step 0.056517418442708474 %forward: 23.291957984350756 %backward: 62.53633532684311 [2025-03-29 05:59:06,854] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18694.73 | forward: 145367.39 | backward_microstep: 390303.39 | backward: 390295.40 | backward_inner_microstep: 390280.34 | backward_inner: 390274.56 | backward_allreduce_microstep: 7.19 | backward_allreduce: 2.47 | reduce_tied_grads: 0.28 | comms: 17.95 | reduce_grads: 0.18 | step: 352.73 | _step_clipping: 0.12 | _step_step: 351.06 | _step_zero_grad: 0.46 | _step_check_overflow: 0.55 samples/sec: 16.407 | iteration 5890/ 143000 | elapsed time per iteration (ms): 62411.5 | learning rate: 5.986E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.725951E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 06:09:37,546] [INFO] [logging.py:60:log_dist] [Rank 0] step=5900, skipped=0, lr=[0.0005985546103805315, 0.0005985546103805315], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5900 loss: 2.7341 iter time (s): 63.069 samples/sec: 16.236 %comms: 0.002880963806681917 %optimizer_step 0.055568692211191754 %forward: 23.045064661881185 %backward: 61.88825802220628 [2025-03-29 06:09:37,547] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25296.12 | forward: 145342.35 | backward_microstep: 390328.96 | backward: 390321.54 | backward_inner_microstep: 390304.12 | backward_inner: 390298.30 | backward_allreduce_microstep: 9.45 | backward_allreduce: 2.52 | reduce_tied_grads: 0.31 | comms: 18.17 | reduce_grads: 0.20 | step: 350.46 | _step_clipping: 0.13 | _step_step: 348.64 | _step_zero_grad: 0.48 | _step_check_overflow: 0.64 samples/sec: 16.236 | iteration 5900/ 143000 | elapsed time per iteration (ms): 63069.3 | learning rate: 5.986E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.730307E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 06:20:03,998] [INFO] [logging.py:60:log_dist] [Rank 0] step=5910, skipped=0, lr=[0.0005985481413061716, 0.0005985481413061716], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5910 loss: 2.7451 iter time (s): 62.645 samples/sec: 16.346 %comms: 0.0028649617099144346 %optimizer_step 0.057612497211351185 %forward: 23.225765800997703 %backward: 62.29586926599357 [2025-03-29 06:20:03,999] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20957.26 | forward: 145496.86 | backward_microstep: 390256.24 | backward: 390249.92 | backward_inner_microstep: 390234.83 | backward_inner: 390229.13 | backward_allreduce_microstep: 7.31 | backward_allreduce: 2.52 | reduce_tied_grads: 0.28 | comms: 17.95 | reduce_grads: 0.18 | step: 360.91 | _step_clipping: 0.12 | _step_step: 359.07 | _step_zero_grad: 0.48 | _step_check_overflow: 0.52 samples/sec: 16.346 | iteration 5910/ 143000 | elapsed time per iteration (ms): 62645.2 | learning rate: 5.985E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.732564E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 06:30:26,487] [INFO] [logging.py:60:log_dist] [Rank 0] step=5920, skipped=0, lr=[0.0005985416578225397, 0.0005985416578225397], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5920 loss: 2.7173 iter time (s): 62.248 samples/sec: 16.450 %comms: 0.0028716701357741083 %optimizer_step 0.05629650844744729 %forward: 23.340555643474666 %backward: 62.68412616064407 [2025-03-29 06:30:26,488] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17267.21 | forward: 145291.10 | backward_microstep: 390204.60 | backward: 390198.31 | backward_inner_microstep: 390183.47 | backward_inner: 390177.86 | backward_allreduce_microstep: 7.19 | backward_allreduce: 2.46 | reduce_tied_grads: 0.26 | comms: 17.88 | reduce_grads: 0.17 | step: 350.44 | _step_clipping: 0.12 | _step_step: 348.68 | _step_zero_grad: 0.46 | _step_check_overflow: 0.64 samples/sec: 16.450 | iteration 5920/ 143000 | elapsed time per iteration (ms): 62248.9 | learning rate: 5.985E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.728779E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 06:40:48,887] [INFO] [logging.py:60:log_dist] [Rank 0] step=5930, skipped=0, lr=[0.0005985351599299487, 0.0005985351599299487], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5930 loss: 2.7241 iter time (s): 62.239 samples/sec: 16.453 %comms: 0.002880354150988893 %optimizer_step 0.057988387483270565 %forward: 23.348324085538763 %backward: 62.706926178112454 [2025-03-29 06:40:48,888] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17054.73 | forward: 145318.72 | backward_microstep: 390290.68 | backward: 390284.55 | backward_inner_microstep: 390269.96 | backward_inner: 390264.63 | backward_allreduce_microstep: 7.10 | backward_allreduce: 2.44 | reduce_tied_grads: 0.25 | comms: 17.93 | reduce_grads: 0.17 | step: 360.92 | _step_clipping: 0.12 | _step_step: 359.12 | _step_zero_grad: 0.48 | _step_check_overflow: 0.64 samples/sec: 16.452 | iteration 5930/ 143000 | elapsed time per iteration (ms): 62240.0 | learning rate: 5.985E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.725099E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 06:51:17,592] [INFO] [logging.py:60:log_dist] [Rank 0] step=5940, skipped=0, lr=[0.0005985286476287121, 0.0005985286476287121], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5940 loss: 2.7216 iter time (s): 62.870 samples/sec: 16.288 %comms: 0.0029033896474768123 %optimizer_step 0.05650788608103516 %forward: 23.139864861954308 %backward: 62.105260184765484 [2025-03-29 06:51:17,592] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22979.92 | forward: 145479.96 | backward_microstep: 390463.70 | backward: 390454.79 | backward_inner_microstep: 390439.19 | backward_inner: 390433.15 | backward_allreduce_microstep: 7.41 | backward_allreduce: 2.56 | reduce_tied_grads: 0.30 | comms: 18.25 | reduce_grads: 0.22 | step: 355.26 | _step_clipping: 0.12 | _step_step: 353.36 | _step_zero_grad: 0.55 | _step_check_overflow: 0.61 samples/sec: 16.287 | iteration 5940/ 143000 | elapsed time per iteration (ms): 62870.5 | learning rate: 5.985E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.726859E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 07:01:40,314] [INFO] [logging.py:60:log_dist] [Rank 0] step=5950, skipped=0, lr=[0.0005985221209191445, 0.0005985221209191445], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5950 loss: 2.7255 iter time (s): 62.272 samples/sec: 16.444 %comms: 0.0028713606682764605 %optimizer_step 0.05533058712472319 %forward: 23.332931512833394 %backward: 62.667730328985584 [2025-03-29 07:01:40,314] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17477.55 | forward: 145298.04 | backward_microstep: 390248.50 | backward: 390242.36 | backward_inner_microstep: 390227.21 | backward_inner: 390221.52 | backward_allreduce_microstep: 7.42 | backward_allreduce: 2.48 | reduce_tied_grads: 0.28 | comms: 17.88 | reduce_grads: 0.18 | step: 344.55 | _step_clipping: 0.13 | _step_step: 342.91 | _step_zero_grad: 0.46 | _step_check_overflow: 0.52 samples/sec: 16.444 | iteration 5950/ 143000 | elapsed time per iteration (ms): 62272.2 | learning rate: 5.985E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.725247E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 07:11:58,770] [INFO] [logging.py:60:log_dist] [Rank 0] step=5960, skipped=0, lr=[0.0005985155798015606, 0.0005985155798015606], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5960 loss: 2.7465 iter time (s): 61.845 samples/sec: 16.558 %comms: 0.002894173646987014 %optimizer_step 0.055901568855881656 %forward: 23.47891370266598 %backward: 63.08739345843502 [2025-03-29 07:11:58,770] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13387.24 | forward: 145205.50 | backward_microstep: 390170.78 | backward: 390164.41 | backward_inner_microstep: 390149.50 | backward_inner: 390143.83 | backward_allreduce_microstep: 7.21 | backward_allreduce: 2.48 | reduce_tied_grads: 0.28 | comms: 17.90 | reduce_grads: 0.19 | step: 345.72 | _step_clipping: 0.11 | _step_step: 344.11 | _step_zero_grad: 0.46 | _step_check_overflow: 0.49 samples/sec: 16.557 | iteration 5960/ 143000 | elapsed time per iteration (ms): 61845.6 | learning rate: 5.985E-04 | approx flops per GPU: 71.4TFLOPS | lm_loss: 2.719966E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 07:22:26,525] [INFO] [logging.py:60:log_dist] [Rank 0] step=5970, skipped=0, lr=[0.0005985090242762762, 0.0005985090242762762], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5970 loss: 2.7419 iter time (s): 62.775 samples/sec: 16.312 %comms: 0.00284385391766487 %optimizer_step 0.060095793220337944 %forward: 23.151976917595604 %backward: 62.16443511247206 [2025-03-29 07:22:26,526] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22462.85 | forward: 145336.64 | backward_microstep: 390244.93 | backward: 390237.53 | backward_inner_microstep: 390222.39 | backward_inner: 390216.75 | backward_allreduce_microstep: 7.13 | backward_allreduce: 2.47 | reduce_tied_grads: 0.26 | comms: 17.85 | reduce_grads: 0.18 | step: 377.25 | _step_clipping: 0.11 | _step_step: 373.76 | _step_zero_grad: 0.47 | _step_check_overflow: 0.59 samples/sec: 16.312 | iteration 5970/ 143000 | elapsed time per iteration (ms): 62775.6 | learning rate: 5.985E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.731717E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 07:32:52,939] [INFO] [logging.py:60:log_dist] [Rank 0] step=5980, skipped=0, lr=[0.0005985024543436078, 0.0005985024543436078], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5980 loss: 2.7257 iter time (s): 62.641 samples/sec: 16.347 %comms: 0.002849682357987048 %optimizer_step 0.05523856771788097 %forward: 23.203152868373202 %backward: 62.30949215358188 [2025-03-29 07:32:52,939] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21044.55 | forward: 145346.40 | backward_microstep: 390318.53 | backward: 390311.63 | backward_inner_microstep: 390296.38 | backward_inner: 390290.65 | backward_allreduce_microstep: 7.37 | backward_allreduce: 2.54 | reduce_tied_grads: 0.25 | comms: 17.85 | reduce_grads: 0.18 | step: 346.02 | _step_clipping: 0.12 | _step_step: 344.37 | _step_zero_grad: 0.46 | _step_check_overflow: 0.54 samples/sec: 16.347 | iteration 5980/ 143000 | elapsed time per iteration (ms): 62641.3 | learning rate: 5.985E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.725422E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 07:43:11,818] [INFO] [logging.py:60:log_dist] [Rank 0] step=5990, skipped=0, lr=[0.0005984958700038725, 0.0005984958700038725], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 5990 loss: 2.7605 iter time (s): 61.887 samples/sec: 16.546 %comms: 0.002884179561476389 %optimizer_step 0.056686113373014185 %forward: 23.487838382217955 %backward: 63.0527986009927 [2025-03-29 07:43:11,819] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13622.79 | forward: 145360.19 | backward_microstep: 390223.76 | backward: 390217.55 | backward_inner_microstep: 390202.73 | backward_inner: 390197.26 | backward_allreduce_microstep: 7.21 | backward_allreduce: 2.48 | reduce_tied_grads: 0.26 | comms: 17.85 | reduce_grads: 0.18 | step: 350.82 | _step_clipping: 0.13 | _step_step: 349.17 | _step_zero_grad: 0.46 | _step_check_overflow: 0.54 samples/sec: 16.546 | iteration 5990/ 143000 | elapsed time per iteration (ms): 61888.0 | learning rate: 5.985E-04 | approx flops per GPU: 71.4TFLOPS | lm_loss: 2.736986E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 07:53:34,602] [INFO] [logging.py:60:log_dist] [Rank 0] step=6000, skipped=0, lr=[0.000598489271257388, 0.000598489271257388], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6000 loss: 2.7393 iter time (s): 62.278 samples/sec: 16.442 %comms: 0.0028692772711016975 %optimizer_step 0.056047628944556435 %forward: 23.342428447282725 %backward: 62.657308071834784 [2025-03-29 07:53:34,602] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17520.34 | forward: 145371.56 | backward_microstep: 390222.78 | backward: 390216.07 | backward_inner_microstep: 390201.15 | backward_inner: 390195.44 | backward_allreduce_microstep: 7.21 | backward_allreduce: 2.48 | reduce_tied_grads: 0.26 | comms: 17.87 | reduce_grads: 0.19 | step: 349.05 | _step_clipping: 0.13 | _step_step: 347.38 | _step_zero_grad: 0.46 | _step_check_overflow: 0.54 samples/sec: 16.442 | iteration 6000/ 143000 | elapsed time per iteration (ms): 62278.3 | learning rate: 5.985E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.738606E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 07:53:37,453] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: ../checkpoints/mydata_left-pythia160m/global_step6000/mp_rank_00_model_states.pt [2025-03-29 07:53:51,008] [INFO] [engine.py:1805:_copy_recovery_script] creating recovery script ../checkpoints/mydata_left-pythia160m/zero_to_fp32.py [2025-03-29 07:53:51,012] [INFO] [engine.py:1818:_save_zero_checkpoint] zero checkpoint saved ../checkpoints/mydata_left-pythia160m/global_step6000/zero_pp_rank_0_mp_rank_00_optim_states.pt [2025-03-29 08:04:19,764] [INFO] [logging.py:60:log_dist] [Rank 0] step=6010, skipped=0, lr=[0.0005984826581044726, 0.0005984826581044726], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6010 loss: 2.7204 iter time (s): 62.873 samples/sec: 16.287 %comms: 0.002857528078355073 %optimizer_step 0.05633770032330376 %forward: 23.121883584280305 %backward: 62.06900615873445 [2025-03-29 08:04:19,765] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23415.97 | forward: 145375.30 | backward_microstep: 390257.09 | backward: 390249.38 | backward_inner_microstep: 390234.25 | backward_inner: 390228.42 | backward_allreduce_microstep: 7.18 | backward_allreduce: 2.48 | reduce_tied_grads: 0.31 | comms: 17.97 | reduce_grads: 0.23 | step: 354.21 | _step_clipping: 0.14 | _step_step: 352.49 | _step_zero_grad: 0.48 | _step_check_overflow: 0.52 samples/sec: 15.872 | iteration 6010/ 143000 | elapsed time per iteration (ms): 64516.2 | learning rate: 5.985E-04 | approx flops per GPU: 68.5TFLOPS | lm_loss: 2.727788E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 08:14:42,916] [INFO] [logging.py:60:log_dist] [Rank 0] step=6020, skipped=0, lr=[0.0005984760305454459, 0.0005984760305454459], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6020 loss: 2.7044 iter time (s): 62.315 samples/sec: 16.433 %comms: 0.0028730144383952577 %optimizer_step 0.05799937831841039 %forward: 23.321088792270626 %backward: 62.63579488822708 [2025-03-29 08:14:42,916] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17771.02 | forward: 145324.55 | backward_microstep: 390321.68 | backward: 390312.78 | backward_inner_microstep: 390290.33 | backward_inner: 390284.58 | backward_allreduce_microstep: 7.30 | backward_allreduce: 2.48 | reduce_tied_grads: 0.29 | comms: 17.90 | reduce_grads: 0.18 | step: 361.42 | _step_clipping: 0.13 | _step_step: 359.76 | _step_zero_grad: 0.47 | _step_check_overflow: 0.53 samples/sec: 16.433 | iteration 6020/ 143000 | elapsed time per iteration (ms): 62315.2 | learning rate: 5.985E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.726619E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 08:25:11,885] [INFO] [logging.py:60:log_dist] [Rank 0] step=6030, skipped=0, lr=[0.0005984693885806274, 0.0005984693885806274], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6030 loss: 2.7261 iter time (s): 62.896 samples/sec: 16.281 %comms: 0.0028416678473939037 %optimizer_step 0.05483799172652187 %forward: 23.114407613445472 %backward: 62.04880449765685 [2025-03-29 08:25:11,886] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23599.75 | forward: 145381.15 | backward_microstep: 390271.24 | backward: 390264.23 | backward_inner_microstep: 390245.65 | backward_inner: 390239.91 | backward_allreduce_microstep: 8.99 | backward_allreduce: 4.23 | reduce_tied_grads: 0.27 | comms: 17.87 | reduce_grads: 0.18 | step: 344.91 | _step_clipping: 0.12 | _step_step: 343.29 | _step_zero_grad: 0.46 | _step_check_overflow: 0.50 samples/sec: 16.281 | iteration 6030/ 143000 | elapsed time per iteration (ms): 62896.9 | learning rate: 5.985E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.726556E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 08:35:38,660] [INFO] [logging.py:60:log_dist] [Rank 0] step=6040, skipped=0, lr=[0.000598462732210338, 0.000598462732210338], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6040 loss: 2.7330 iter time (s): 62.677 samples/sec: 16.338 %comms: 0.0028556498409780464 %optimizer_step 0.05560184676717461 %forward: 23.184457316168636 %backward: 62.25113336967166 [2025-03-29 08:35:38,660] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21589.61 | forward: 145312.94 | backward_microstep: 390176.80 | backward: 390170.66 | backward_inner_microstep: 390155.64 | backward_inner: 390150.02 | backward_allreduce_microstep: 7.36 | backward_allreduce: 2.46 | reduce_tied_grads: 0.28 | comms: 17.90 | reduce_grads: 0.20 | step: 348.50 | _step_clipping: 0.12 | _step_step: 346.90 | _step_zero_grad: 0.46 | _step_check_overflow: 0.47 samples/sec: 16.338 | iteration 6040/ 143000 | elapsed time per iteration (ms): 62677.4 | learning rate: 5.985E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.724257E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 08:46:00,759] [INFO] [logging.py:60:log_dist] [Rank 0] step=6050, skipped=0, lr=[0.0005984560614348985, 0.0005984560614348985], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6050 loss: 2.7263 iter time (s): 62.209 samples/sec: 16.461 %comms: 0.0029060048349811426 %optimizer_step 0.05710980730358015 %forward: 23.355719448232488 %backward: 62.75259520517466 [2025-03-29 08:46:00,760] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16685.79 | forward: 145294.59 | backward_microstep: 390387.80 | backward: 390380.28 | backward_inner_microstep: 390363.41 | backward_inner: 390357.55 | backward_allreduce_microstep: 8.97 | backward_allreduce: 2.49 | reduce_tied_grads: 0.33 | comms: 18.08 | reduce_grads: 0.19 | step: 355.28 | _step_clipping: 0.14 | _step_step: 353.52 | _step_zero_grad: 0.47 | _step_check_overflow: 0.58 samples/sec: 16.460 | iteration 6050/ 143000 | elapsed time per iteration (ms): 62210.0 | learning rate: 5.985E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.718777E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 08:56:28,033] [INFO] [logging.py:60:log_dist] [Rank 0] step=6060, skipped=0, lr=[0.0005984493762546315, 0.0005984493762546315], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6060 loss: 2.7223 iter time (s): 62.727 samples/sec: 16.325 %comms: 0.002897846672856106 %optimizer_step 0.056652539468073235 %forward: 23.171516969763854 %backward: 62.22293187704937 [2025-03-29 08:56:28,034] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21896.05 | forward: 145347.56 | backward_microstep: 390312.86 | backward: 390304.67 | backward_inner_microstep: 390289.33 | backward_inner: 390281.77 | backward_allreduce_microstep: 7.29 | backward_allreduce: 2.49 | reduce_tied_grads: 0.29 | comms: 18.18 | reduce_grads: 0.18 | step: 355.36 | _step_clipping: 0.12 | _step_step: 353.61 | _step_zero_grad: 0.48 | _step_check_overflow: 0.59 samples/sec: 16.325 | iteration 6060/ 143000 | elapsed time per iteration (ms): 62727.4 | learning rate: 5.984E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.715639E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 09:06:50,129] [INFO] [logging.py:60:log_dist] [Rank 0] step=6070, skipped=0, lr=[0.0005984426766698592, 0.0005984426766698592], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6070 loss: 2.7437 iter time (s): 62.209 samples/sec: 16.461 %comms: 0.0028647424060443804 %optimizer_step 0.05618701588471917 %forward: 23.355936930709913 %backward: 62.720472402538064 [2025-03-29 09:06:50,130] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16923.47 | forward: 145295.24 | backward_microstep: 390184.57 | backward: 390178.57 | backward_inner_microstep: 390163.69 | backward_inner: 390158.21 | backward_allreduce_microstep: 7.23 | backward_allreduce: 2.49 | reduce_tied_grads: 0.24 | comms: 17.82 | reduce_grads: 0.19 | step: 349.53 | _step_clipping: 0.13 | _step_step: 347.91 | _step_zero_grad: 0.46 | _step_check_overflow: 0.52 samples/sec: 16.460 | iteration 6070/ 143000 | elapsed time per iteration (ms): 62209.6 | learning rate: 5.984E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.724051E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 09:17:09,161] [INFO] [logging.py:60:log_dist] [Rank 0] step=6080, skipped=0, lr=[0.000598435962680905, 0.000598435962680905], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6080 loss: 2.7102 iter time (s): 61.903 samples/sec: 16.542 %comms: 0.002888439980411605 %optimizer_step 0.056487363691327314 %forward: 23.487638968381113 %backward: 63.03866716799206 [2025-03-29 09:17:09,161] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13686.72 | forward: 145394.65 | backward_microstep: 390234.26 | backward: 390225.89 | backward_inner_microstep: 390210.99 | backward_inner: 390205.46 | backward_allreduce_microstep: 7.24 | backward_allreduce: 2.49 | reduce_tied_grads: 0.26 | comms: 17.88 | reduce_grads: 0.20 | step: 349.67 | _step_clipping: 0.11 | _step_step: 348.03 | _step_zero_grad: 0.46 | _step_check_overflow: 0.54 samples/sec: 16.542 | iteration 6080/ 143000 | elapsed time per iteration (ms): 61903.1 | learning rate: 5.984E-04 | approx flops per GPU: 71.4TFLOPS | lm_loss: 2.722537E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 09:27:36,729] [INFO] [logging.py:60:log_dist] [Rank 0] step=6090, skipped=0, lr=[0.0005984292342880931, 0.0005984292342880931], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6090 loss: 2.7161 iter time (s): 62.756 samples/sec: 16.317 %comms: 0.0028567853513827633 %optimizer_step 0.055973632577842825 %forward: 23.173906016529074 %backward: 62.19662631885772 [2025-03-29 09:27:36,730] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22041.24 | forward: 145430.82 | backward_microstep: 390330.30 | backward: 390322.92 | backward_inner_microstep: 390307.69 | backward_inner: 390301.90 | backward_allreduce_microstep: 7.31 | backward_allreduce: 2.50 | reduce_tied_grads: 0.30 | comms: 17.93 | reduce_grads: 0.18 | step: 351.27 | _step_clipping: 0.13 | _step_step: 349.50 | _step_zero_grad: 0.49 | _step_check_overflow: 0.59 samples/sec: 16.317 | iteration 6090/ 143000 | elapsed time per iteration (ms): 62756.9 | learning rate: 5.984E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.717242E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 09:38:00,734] [INFO] [logging.py:60:log_dist] [Rank 0] step=6100, skipped=0, lr=[0.0005984224914917481, 0.0005984224914917481], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6100 loss: 2.7161 iter time (s): 62.400 samples/sec: 16.410 %comms: 0.002928160642362907 %optimizer_step 0.05599386526637247 %forward: 23.288587449441167 %backward: 62.5475557644127 [2025-03-29 09:38:00,734] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18615.37 | forward: 145320.49 | backward_microstep: 390303.78 | backward: 390295.94 | backward_inner_microstep: 390280.44 | backward_inner: 390274.44 | backward_allreduce_microstep: 7.48 | backward_allreduce: 2.55 | reduce_tied_grads: 0.30 | comms: 18.27 | reduce_grads: 0.18 | step: 349.40 | _step_clipping: 0.12 | _step_step: 347.72 | _step_zero_grad: 0.47 | _step_check_overflow: 0.53 samples/sec: 16.410 | iteration 6100/ 143000 | elapsed time per iteration (ms): 62400.4 | learning rate: 5.984E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.709974E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 09:48:24,305] [INFO] [logging.py:60:log_dist] [Rank 0] step=6110, skipped=0, lr=[0.0005984157342921954, 0.0005984157342921954], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6110 loss: 2.7090 iter time (s): 62.357 samples/sec: 16.422 %comms: 0.002879529083025036 %optimizer_step 0.057075140142970174 %forward: 23.308121316279298 %backward: 62.59790330059888 [2025-03-29 09:48:24,306] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18117.39 | forward: 145341.65 | backward_microstep: 390346.57 | backward: 390339.58 | backward_inner_microstep: 390324.56 | backward_inner: 390318.86 | backward_allreduce_microstep: 7.26 | backward_allreduce: 2.49 | reduce_tied_grads: 0.28 | comms: 17.96 | reduce_grads: 0.18 | step: 355.90 | _step_clipping: 0.13 | _step_step: 354.19 | _step_zero_grad: 0.47 | _step_check_overflow: 0.56 samples/sec: 16.422 | iteration 6110/ 143000 | elapsed time per iteration (ms): 62357.2 | learning rate: 5.984E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.703621E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 09:58:52,666] [INFO] [logging.py:60:log_dist] [Rank 0] step=6120, skipped=0, lr=[0.0005984089626897614, 0.0005984089626897614], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6120 loss: 2.7115 iter time (s): 62.836 samples/sec: 16.297 %comms: 0.0028643387017005285 %optimizer_step 0.05601988563937448 %forward: 23.129976296654352 %backward: 62.12495448826094 [2025-03-29 09:58:52,667] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22896.42 | forward: 145338.39 | backward_microstep: 390373.77 | backward: 390365.32 | backward_inner_microstep: 390350.03 | backward_inner: 390344.08 | backward_allreduce_microstep: 7.25 | backward_allreduce: 2.50 | reduce_tied_grads: 0.31 | comms: 18.00 | reduce_grads: 0.19 | step: 352.00 | _step_clipping: 0.12 | _step_step: 350.26 | _step_zero_grad: 0.50 | _step_check_overflow: 0.56 samples/sec: 16.296 | iteration 6120/ 143000 | elapsed time per iteration (ms): 62836.1 | learning rate: 5.984E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.714636E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 10:09:15,386] [INFO] [logging.py:60:log_dist] [Rank 0] step=6130, skipped=0, lr=[0.0005984021766847727, 0.0005984021766847727], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6130 loss: 2.7339 iter time (s): 62.271 samples/sec: 16.444 %comms: 0.0028786878278438286 %optimizer_step 0.05575592608913351 %forward: 23.334085920022197 %backward: 62.680382406273985 [2025-03-29 10:09:15,386] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17370.22 | forward: 145304.50 | backward_microstep: 390326.04 | backward: 390319.20 | backward_inner_microstep: 390300.66 | backward_inner: 390294.96 | backward_allreduce_microstep: 7.24 | backward_allreduce: 2.48 | reduce_tied_grads: 0.28 | comms: 17.93 | reduce_grads: 0.18 | step: 347.20 | _step_clipping: 0.12 | _step_step: 345.48 | _step_zero_grad: 0.50 | _step_check_overflow: 0.54 samples/sec: 16.444 | iteration 6130/ 143000 | elapsed time per iteration (ms): 62271.9 | learning rate: 5.984E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.721869E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 10:19:42,143] [INFO] [logging.py:60:log_dist] [Rank 0] step=6140, skipped=0, lr=[0.0005983953762775568, 0.0005983953762775568], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6140 loss: 2.7132 iter time (s): 62.675 samples/sec: 16.338 %comms: 0.002842713814749628 %optimizer_step 0.055429666941697674 %forward: 23.17255875756295 %backward: 62.25109499744919 [2025-03-29 10:19:42,144] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21667.84 | forward: 145234.61 | backward_microstep: 390166.04 | backward: 390160.34 | backward_inner_microstep: 390145.96 | backward_inner: 390140.65 | backward_allreduce_microstep: 7.01 | backward_allreduce: 2.43 | reduce_tied_grads: 0.23 | comms: 17.82 | reduce_grads: 0.18 | step: 347.41 | _step_clipping: 0.15 | _step_step: 345.73 | _step_zero_grad: 0.47 | _step_check_overflow: 0.51 samples/sec: 16.338 | iteration 6140/ 143000 | elapsed time per iteration (ms): 62675.8 | learning rate: 5.984E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.720461E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 10:30:04,782] [INFO] [logging.py:60:log_dist] [Rank 0] step=6150, skipped=0, lr=[0.0005983885614684421, 0.0005983885614684421], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6150 loss: 2.7164 iter time (s): 62.263 samples/sec: 16.446 %comms: 0.002880856843056746 %optimizer_step 0.05665822975711511 %forward: 23.339332362678768 %backward: 62.6911818166743 [2025-03-29 10:30:04,783] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17205.61 | forward: 145318.53 | backward_microstep: 390345.20 | backward: 390336.38 | backward_inner_microstep: 390315.71 | backward_inner: 390308.23 | backward_allreduce_microstep: 9.54 | backward_allreduce: 4.68 | reduce_tied_grads: 0.31 | comms: 17.94 | reduce_grads: 0.20 | step: 352.77 | _step_clipping: 0.14 | _step_step: 351.02 | _step_zero_grad: 0.46 | _step_check_overflow: 0.59 samples/sec: 16.446 | iteration 6150/ 143000 | elapsed time per iteration (ms): 62263.9 | learning rate: 5.984E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.713286E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 10:40:33,021] [INFO] [logging.py:60:log_dist] [Rank 0] step=6160, skipped=0, lr=[0.0005983817322577574, 0.0005983817322577574], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6160 loss: 2.7234 iter time (s): 62.823 samples/sec: 16.300 %comms: 0.0031280851819329406 %optimizer_step 0.05705202324177651 %forward: 23.148856077408368 %backward: 62.13927274623317 [2025-03-29 10:40:33,021] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22647.48 | forward: 145428.66 | backward_microstep: 390387.44 | backward: 390379.18 | backward_inner_microstep: 390363.75 | backward_inner: 390357.89 | backward_allreduce_microstep: 7.46 | backward_allreduce: 2.70 | reduce_tied_grads: 0.31 | comms: 19.65 | reduce_grads: 0.18 | step: 358.42 | _step_clipping: 0.12 | _step_step: 356.70 | _step_zero_grad: 0.47 | _step_check_overflow: 0.57 samples/sec: 16.300 | iteration 6160/ 143000 | elapsed time per iteration (ms): 62823.8 | learning rate: 5.984E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.703009E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 10:50:56,329] [INFO] [logging.py:60:log_dist] [Rank 0] step=6170, skipped=0, lr=[0.0005983748886458322, 0.0005983748886458322], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6170 loss: 2.7196 iter time (s): 62.330 samples/sec: 16.429 %comms: 0.0029037364227854643 %optimizer_step 0.05718558755563093 %forward: 23.323406666478277 %backward: 62.614104992259854 [2025-03-29 10:50:56,329] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17919.79 | forward: 145375.44 | backward_microstep: 390285.00 | backward: 390275.45 | backward_inner_microstep: 390254.35 | backward_inner: 390246.83 | backward_allreduce_microstep: 9.58 | backward_allreduce: 4.85 | reduce_tied_grads: 0.29 | comms: 18.10 | reduce_grads: 0.19 | step: 356.44 | _step_clipping: 0.14 | _step_step: 354.71 | _step_zero_grad: 0.47 | _step_check_overflow: 0.56 samples/sec: 16.428 | iteration 6170/ 143000 | elapsed time per iteration (ms): 62330.8 | learning rate: 5.984E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.723215E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 11:01:23,520] [INFO] [logging.py:60:log_dist] [Rank 0] step=6180, skipped=0, lr=[0.000598368030632997, 0.000598368030632997], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6180 loss: 2.7092 iter time (s): 62.719 samples/sec: 16.327 %comms: 0.0028730616343175517 %optimizer_step 0.05535605426050692 %forward: 23.188897312322705 %backward: 62.24312022033948 [2025-03-29 11:01:23,521] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21635.98 | forward: 145437.50 | backward_microstep: 390388.06 | backward: 390380.08 | backward_inner_microstep: 390358.99 | backward_inner: 390353.12 | backward_allreduce_microstep: 7.33 | backward_allreduce: 2.51 | reduce_tied_grads: 0.29 | comms: 18.02 | reduce_grads: 0.18 | step: 347.19 | _step_clipping: 0.13 | _step_step: 345.52 | _step_zero_grad: 0.47 | _step_check_overflow: 0.53 samples/sec: 16.327 | iteration 6180/ 143000 | elapsed time per iteration (ms): 62719.1 | learning rate: 5.984E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.710232E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 11:11:46,222] [INFO] [logging.py:60:log_dist] [Rank 0] step=6190, skipped=0, lr=[0.0005983611582195828, 0.0005983611582195828], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6190 loss: 2.7013 iter time (s): 62.270 samples/sec: 16.445 %comms: 0.002895035036086396 %optimizer_step 0.05550854076054973 %forward: 23.33118659841157 %backward: 62.66086502799181 [2025-03-29 11:11:46,223] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17520.03 | forward: 145282.67 | backward_microstep: 390194.07 | backward: 390187.53 | backward_inner_microstep: 390172.50 | backward_inner: 390166.92 | backward_allreduce_microstep: 7.27 | backward_allreduce: 2.48 | reduce_tied_grads: 0.24 | comms: 18.03 | reduce_grads: 0.17 | step: 345.65 | _step_clipping: 0.12 | _step_step: 344.04 | _step_zero_grad: 0.45 | _step_check_overflow: 0.51 samples/sec: 16.444 | iteration 6190/ 143000 | elapsed time per iteration (ms): 62270.2 | learning rate: 5.984E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.710469E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 11:22:08,639] [INFO] [logging.py:60:log_dist] [Rank 0] step=6200, skipped=0, lr=[0.000598354271405921, 0.000598354271405921], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6200 loss: 2.7000 iter time (s): 62.241 samples/sec: 16.452 %comms: 0.002880966154925972 %optimizer_step 0.05655074934691972 %forward: 23.33890080074694 %backward: 62.70012697855455 [2025-03-29 11:22:08,640] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17150.54 | forward: 145263.98 | backward_microstep: 390259.35 | backward: 390252.73 | backward_inner_microstep: 390237.84 | backward_inner: 390232.30 | backward_allreduce_microstep: 7.19 | backward_allreduce: 2.50 | reduce_tied_grads: 0.28 | comms: 17.93 | reduce_grads: 0.18 | step: 351.98 | _step_clipping: 0.12 | _step_step: 350.28 | _step_zero_grad: 0.46 | _step_check_overflow: 0.58 samples/sec: 16.452 | iteration 6200/ 143000 | elapsed time per iteration (ms): 62241.7 | learning rate: 5.984E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.703740E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 11:32:32,383] [INFO] [logging.py:60:log_dist] [Rank 0] step=6210, skipped=0, lr=[0.0005983473701923444, 0.0005983473701923444], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6210 loss: 2.7081 iter time (s): 62.374 samples/sec: 16.417 %comms: 0.0029067167833403927 %optimizer_step 0.0573847219847542 %forward: 23.310566496934893 %backward: 62.57328735616618 [2025-03-29 11:32:32,383] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18286.67 | forward: 145396.90 | backward_microstep: 390304.69 | backward: 390293.49 | backward_inner_microstep: 390276.48 | backward_inner: 390270.68 | backward_allreduce_microstep: 7.26 | backward_allreduce: 2.49 | reduce_tied_grads: 0.32 | comms: 18.13 | reduce_grads: 0.20 | step: 357.93 | _step_clipping: 0.15 | _step_step: 356.12 | _step_zero_grad: 0.49 | _step_check_overflow: 0.58 samples/sec: 16.417 | iteration 6210/ 143000 | elapsed time per iteration (ms): 62374.4 | learning rate: 5.983E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.696252E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 11:42:59,172] [INFO] [logging.py:60:log_dist] [Rank 0] step=6220, skipped=0, lr=[0.0005983404545791859, 0.0005983404545791859], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6220 loss: 2.7050 iter time (s): 62.678 samples/sec: 16.337 %comms: 0.0028556584881310773 %optimizer_step 0.060111434296494806 %forward: 23.18231073253241 %backward: 62.25417514964538 [2025-03-29 11:42:59,172] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21568.26 | forward: 145302.91 | backward_microstep: 390205.70 | backward: 390198.94 | backward_inner_microstep: 390184.09 | backward_inner: 390178.53 | backward_allreduce_microstep: 7.19 | backward_allreduce: 2.47 | reduce_tied_grads: 0.25 | comms: 17.90 | reduce_grads: 0.18 | step: 376.77 | _step_clipping: 0.11 | _step_step: 375.14 | _step_zero_grad: 0.46 | _step_check_overflow: 0.51 samples/sec: 16.337 | iteration 6220/ 143000 | elapsed time per iteration (ms): 62678.9 | learning rate: 5.983E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.700138E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 11:53:22,716] [INFO] [logging.py:60:log_dist] [Rank 0] step=6230, skipped=0, lr=[0.0005983335245667791, 0.0005983335245667791], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6230 loss: 2.7070 iter time (s): 62.354 samples/sec: 16.422 %comms: 0.0029532642244753113 %optimizer_step 0.05780914253289913 %forward: 23.314898584703116 %backward: 62.618096758449205 [2025-03-29 11:53:22,716] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17958.22 | forward: 145377.34 | backward_microstep: 390456.03 | backward: 390447.86 | backward_inner_microstep: 390430.60 | backward_inner: 390424.61 | backward_allreduce_microstep: 9.15 | backward_allreduce: 4.30 | reduce_tied_grads: 0.34 | comms: 18.41 | reduce_grads: 0.20 | step: 360.46 | _step_clipping: 0.15 | _step_step: 358.49 | _step_zero_grad: 0.51 | _step_check_overflow: 0.69 samples/sec: 16.422 | iteration 6230/ 143000 | elapsed time per iteration (ms): 62354.4 | learning rate: 5.983E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.705758E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 12:03:45,953] [INFO] [logging.py:60:log_dist] [Rank 0] step=6240, skipped=0, lr=[0.0005983265801554587, 0.0005983265801554587], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6240 loss: 2.6804 iter time (s): 62.323 samples/sec: 16.430 %comms: 0.0028870810740838754 %optimizer_step 0.05679174506792481 %forward: 23.34384132426443 %backward: 62.674068185803776 [2025-03-29 12:03:45,954] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17341.23 | forward: 145486.28 | backward_microstep: 390615.88 | backward: 390604.82 | backward_inner_microstep: 390588.99 | backward_inner: 390582.87 | backward_allreduce_microstep: 7.33 | backward_allreduce: 2.53 | reduce_tied_grads: 0.32 | comms: 17.99 | reduce_grads: 0.18 | step: 353.94 | _step_clipping: 0.14 | _step_step: 352.14 | _step_zero_grad: 0.50 | _step_check_overflow: 0.58 samples/sec: 16.430 | iteration 6240/ 143000 | elapsed time per iteration (ms): 62323.8 | learning rate: 5.983E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.696881E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 12:14:10,913] [INFO] [logging.py:60:log_dist] [Rank 0] step=6250, skipped=0, lr=[0.0005983196213455598, 0.0005983196213455598], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6250 loss: 2.7110 iter time (s): 62.495 samples/sec: 16.385 %comms: 0.0028795441266883225 %optimizer_step 0.05662775361190141 %forward: 23.29460447834142 %backward: 62.516590964826236 [2025-03-29 12:14:10,914] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18876.87 | forward: 145580.63 | backward_microstep: 390709.41 | backward: 390700.11 | backward_inner_microstep: 390684.63 | backward_inner: 390678.49 | backward_allreduce_microstep: 7.31 | backward_allreduce: 2.51 | reduce_tied_grads: 0.28 | comms: 18.00 | reduce_grads: 0.18 | step: 353.90 | _step_clipping: 0.11 | _step_step: 352.19 | _step_zero_grad: 0.46 | _step_check_overflow: 0.58 samples/sec: 16.385 | iteration 6250/ 143000 | elapsed time per iteration (ms): 62496.0 | learning rate: 5.983E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.701978E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 12:24:37,780] [INFO] [logging.py:60:log_dist] [Rank 0] step=6260, skipped=0, lr=[0.0005983126481374182, 0.0005983126481374182], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6260 loss: 2.7141 iter time (s): 62.686 samples/sec: 16.335 %comms: 0.002873940932437381 %optimizer_step 0.05663534514903659 %forward: 23.190680540145568 %backward: 62.27216448452081 [2025-03-29 12:24:37,782] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21411.91 | forward: 145373.40 | backward_microstep: 390368.21 | backward: 390360.09 | backward_inner_microstep: 390343.06 | backward_inner: 390337.20 | backward_allreduce_microstep: 7.23 | backward_allreduce: 2.46 | reduce_tied_grads: 0.36 | comms: 18.02 | reduce_grads: 0.19 | step: 355.03 | _step_clipping: 0.13 | _step_step: 353.07 | _step_zero_grad: 0.49 | _step_check_overflow: 0.56 samples/sec: 16.335 | iteration 6260/ 143000 | elapsed time per iteration (ms): 62686.8 | learning rate: 5.983E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.709133E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 12:35:04,736] [INFO] [logging.py:60:log_dist] [Rank 0] step=6270, skipped=0, lr=[0.0005983056605313707, 0.0005983056605313707], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6270 loss: 2.7280 iter time (s): 62.695 samples/sec: 16.333 %comms: 0.0028504970076757737 %optimizer_step 0.05681230640418064 %forward: 23.183437651711213 %backward: 62.24985794014224 [2025-03-29 12:35:04,736] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21612.62 | forward: 145348.16 | backward_microstep: 390281.12 | backward: 390274.40 | backward_inner_microstep: 390259.54 | backward_inner: 390253.97 | backward_allreduce_microstep: 7.16 | backward_allreduce: 2.46 | reduce_tied_grads: 0.26 | comms: 17.87 | reduce_grads: 0.19 | step: 356.18 | _step_clipping: 0.13 | _step_step: 354.55 | _step_zero_grad: 0.47 | _step_check_overflow: 0.50 samples/sec: 16.333 | iteration 6270/ 143000 | elapsed time per iteration (ms): 62695.4 | learning rate: 5.983E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.721284E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 12:45:22,713] [INFO] [logging.py:60:log_dist] [Rank 0] step=6280, skipped=0, lr=[0.0005982986585277543, 0.0005982986585277543], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6280 loss: 2.7356 iter time (s): 61.797 samples/sec: 16.570 %comms: 0.002891941961292005 %optimizer_step 0.056127972256690806 %forward: 23.49586726821441 %backward: 63.14050812677995 [2025-03-29 12:45:22,713] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 12888.05 | forward: 145197.79 | backward_microstep: 390196.71 | backward: 390190.40 | backward_inner_microstep: 390175.46 | backward_inner: 390169.86 | backward_allreduce_microstep: 7.21 | backward_allreduce: 2.50 | reduce_tied_grads: 0.28 | comms: 17.87 | reduce_grads: 0.18 | step: 346.85 | _step_clipping: 0.11 | _step_step: 345.24 | _step_zero_grad: 0.48 | _step_check_overflow: 0.49 samples/sec: 16.570 | iteration 6280/ 143000 | elapsed time per iteration (ms): 61797.7 | learning rate: 5.983E-04 | approx flops per GPU: 71.5TFLOPS | lm_loss: 2.725117E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 12:55:49,767] [INFO] [logging.py:60:log_dist] [Rank 0] step=6290, skipped=0, lr=[0.000598291642126907, 0.000598291642126907], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6290 loss: 2.7150 iter time (s): 62.705 samples/sec: 16.330 %comms: 0.002881939094309203 %optimizer_step 0.055990058431022366 %forward: 23.164486181038548 %backward: 62.21469923673959 [2025-03-29 12:55:49,768] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21975.58 | forward: 145252.71 | backward_microstep: 390123.72 | backward: 390116.74 | backward_inner_microstep: 390101.68 | backward_inner: 390095.94 | backward_allreduce_microstep: 7.25 | backward_allreduce: 2.50 | reduce_tied_grads: 0.31 | comms: 18.07 | reduce_grads: 0.18 | step: 351.09 | _step_clipping: 0.12 | _step_step: 349.36 | _step_zero_grad: 0.47 | _step_check_overflow: 0.57 samples/sec: 16.330 | iteration 6290/ 143000 | elapsed time per iteration (ms): 62705.5 | learning rate: 5.983E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.705901E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 13:06:11,484] [INFO] [logging.py:60:log_dist] [Rank 0] step=6300, skipped=0, lr=[0.0005982846113291674, 0.0005982846113291674], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6300 loss: 2.6814 iter time (s): 62.171 samples/sec: 16.471 %comms: 0.0028843606797871557 %optimizer_step 0.05649640912013019 %forward: 23.34383322329117 %backward: 62.705649405757654 [2025-03-29 13:06:11,485] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17086.79 | forward: 145131.40 | backward_microstep: 389855.46 | backward: 389848.52 | backward_inner_microstep: 389833.65 | backward_inner: 389828.00 | backward_allreduce_microstep: 7.14 | backward_allreduce: 2.45 | reduce_tied_grads: 0.30 | comms: 17.93 | reduce_grads: 0.19 | step: 351.24 | _step_clipping: 0.14 | _step_step: 349.49 | _step_zero_grad: 0.46 | _step_check_overflow: 0.61 samples/sec: 16.471 | iteration 6300/ 143000 | elapsed time per iteration (ms): 62171.7 | learning rate: 5.983E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.703020E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 13:16:37,710] [INFO] [logging.py:60:log_dist] [Rank 0] step=6310, skipped=0, lr=[0.0005982775661348751, 0.0005982775661348751], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6310 loss: 2.7022 iter time (s): 62.622 samples/sec: 16.352 %comms: 0.0029123263988680773 %optimizer_step 0.058103603391101835 %forward: 23.18065831681969 %backward: 62.26623205272146 [2025-03-29 13:16:37,711] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21478.14 | forward: 145162.08 | backward_microstep: 389931.22 | backward: 389924.03 | backward_inner_microstep: 389908.79 | backward_inner: 389903.00 | backward_allreduce_microstep: 7.30 | backward_allreduce: 2.52 | reduce_tied_grads: 0.29 | comms: 18.24 | reduce_grads: 0.39 | step: 363.86 | _step_clipping: 0.12 | _step_step: 362.16 | _step_zero_grad: 0.48 | _step_check_overflow: 0.54 samples/sec: 16.352 | iteration 6310/ 143000 | elapsed time per iteration (ms): 62622.6 | learning rate: 5.983E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.699936E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 13:27:01,400] [INFO] [logging.py:60:log_dist] [Rank 0] step=6320, skipped=0, lr=[0.0005982705065443697, 0.0005982705065443697], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6320 loss: 2.6899 iter time (s): 62.368 samples/sec: 16.419 %comms: 0.0028898791398352516 %optimizer_step 0.056731954915941935 %forward: 23.292723582308724 %backward: 62.53561543234789 [2025-03-29 13:27:01,401] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18732.54 | forward: 145273.12 | backward_microstep: 390031.93 | backward: 390024.96 | backward_inner_microstep: 390006.14 | backward_inner: 390000.35 | backward_allreduce_microstep: 11.00 | backward_allreduce: 2.51 | reduce_tied_grads: 0.27 | comms: 18.02 | reduce_grads: 0.18 | step: 353.83 | _step_clipping: 0.11 | _step_step: 352.09 | _step_zero_grad: 0.47 | _step_check_overflow: 0.58 samples/sec: 16.418 | iteration 6320/ 143000 | elapsed time per iteration (ms): 62369.0 | learning rate: 5.983E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.698963E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 13:37:23,656] [INFO] [logging.py:60:log_dist] [Rank 0] step=6330, skipped=0, lr=[0.0005982634325579924, 0.0005982634325579924], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6330 loss: 2.7116 iter time (s): 62.225 samples/sec: 16.456 %comms: 0.002869144780728909 %optimizer_step 0.05633856706908597 %forward: 23.33349085493244 %backward: 62.66469654766021 [2025-03-29 13:37:23,656] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17524.29 | forward: 145192.70 | backward_microstep: 389937.32 | backward: 389931.22 | backward_inner_microstep: 389916.78 | backward_inner: 389911.27 | backward_allreduce_microstep: 7.01 | backward_allreduce: 2.41 | reduce_tied_grads: 0.24 | comms: 17.85 | reduce_grads: 0.17 | step: 350.57 | _step_clipping: 0.13 | _step_step: 348.88 | _step_zero_grad: 0.48 | _step_check_overflow: 0.54 samples/sec: 16.456 | iteration 6330/ 143000 | elapsed time per iteration (ms): 62225.5 | learning rate: 5.983E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.712390E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 13:47:51,335] [INFO] [logging.py:60:log_dist] [Rank 0] step=6340, skipped=0, lr=[0.0005982563441760843, 0.0005982563441760843], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6340 loss: 2.6977 iter time (s): 62.767 samples/sec: 16.314 %comms: 0.0028509604124839234 %optimizer_step 0.05540623682892973 %forward: 23.12885537047706 %backward: 62.14038031754926 [2025-03-29 13:47:51,336] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22849.68 | forward: 145173.87 | backward_microstep: 390047.67 | backward: 390039.18 | backward_inner_microstep: 390024.34 | backward_inner: 390018.71 | backward_allreduce_microstep: 7.17 | backward_allreduce: 2.45 | reduce_tied_grads: 0.28 | comms: 17.89 | reduce_grads: 0.18 | step: 347.77 | _step_clipping: 0.14 | _step_step: 346.06 | _step_zero_grad: 0.46 | _step_check_overflow: 0.58 samples/sec: 16.314 | iteration 6340/ 143000 | elapsed time per iteration (ms): 62768.0 | learning rate: 5.983E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.703590E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 13:58:14,616] [INFO] [logging.py:60:log_dist] [Rank 0] step=6350, skipped=0, lr=[0.0005982492413989875, 0.0005982492413989875], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6350 loss: 2.6812 iter time (s): 62.327 samples/sec: 16.429 %comms: 0.00287429924099653 %optimizer_step 0.05591280900697846 %forward: 23.301754213937 %backward: 62.583641930994794 [2025-03-29 13:58:14,616] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18330.79 | forward: 145233.87 | backward_microstep: 390075.42 | backward: 390067.83 | backward_inner_microstep: 390050.32 | backward_inner: 390044.25 | backward_allreduce_microstep: 9.36 | backward_allreduce: 2.76 | reduce_tied_grads: 0.30 | comms: 17.91 | reduce_grads: 0.18 | step: 348.49 | _step_clipping: 0.12 | _step_step: 346.79 | _step_zero_grad: 0.46 | _step_check_overflow: 0.57 samples/sec: 16.429 | iteration 6350/ 143000 | elapsed time per iteration (ms): 62328.0 | learning rate: 5.982E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.698569E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 14:08:42,192] [INFO] [logging.py:60:log_dist] [Rank 0] step=6360, skipped=0, lr=[0.0005982421242270451, 0.0005982421242270451], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6360 loss: 2.7105 iter time (s): 62.757 samples/sec: 16.317 %comms: 0.002853291985656945 %optimizer_step 0.055975696400064855 %forward: 23.136375302843245 %backward: 62.16517248719155 [2025-03-29 14:08:42,192] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22607.77 | forward: 145197.14 | backward_microstep: 390137.90 | backward: 390130.47 | backward_inner_microstep: 390115.28 | backward_inner: 390109.41 | backward_allreduce_microstep: 7.25 | backward_allreduce: 2.50 | reduce_tied_grads: 0.29 | comms: 17.91 | reduce_grads: 0.18 | step: 351.29 | _step_clipping: 0.14 | _step_step: 349.56 | _step_zero_grad: 0.47 | _step_check_overflow: 0.56 samples/sec: 16.317 | iteration 6360/ 143000 | elapsed time per iteration (ms): 62757.6 | learning rate: 5.982E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.697696E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 14:19:09,626] [INFO] [logging.py:60:log_dist] [Rank 0] step=6370, skipped=0, lr=[0.0005982349926606003, 0.0005982349926606003], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6370 loss: 2.6970 iter time (s): 62.743 samples/sec: 16.321 %comms: 0.002842803017713606 %optimizer_step 0.055262256058544665 %forward: 23.15578115983356 %backward: 62.16105419729817 [2025-03-29 14:19:09,626] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22491.58 | forward: 145286.09 | backward_microstep: 390023.09 | backward: 390016.48 | backward_inner_microstep: 390001.69 | backward_inner: 389995.90 | backward_allreduce_microstep: 7.12 | backward_allreduce: 2.43 | reduce_tied_grads: 0.24 | comms: 17.84 | reduce_grads: 0.17 | step: 346.73 | _step_clipping: 0.12 | _step_step: 345.10 | _step_zero_grad: 0.47 | _step_check_overflow: 0.51 samples/sec: 16.320 | iteration 6370/ 143000 | elapsed time per iteration (ms): 62743.4 | learning rate: 5.982E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.696235E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 14:29:35,333] [INFO] [logging.py:60:log_dist] [Rank 0] step=6380, skipped=0, lr=[0.0005982278466999976, 0.0005982278466999976], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6380 loss: 2.7087 iter time (s): 62.570 samples/sec: 16.366 %comms: 0.0028617399332306914 %optimizer_step 0.05684670736040555 %forward: 23.201324511229867 %backward: 62.33336499665562 [2025-03-29 14:29:35,334] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20874.67 | forward: 145171.04 | backward_microstep: 390027.81 | backward: 390020.82 | backward_inner_microstep: 390005.77 | backward_inner: 389999.93 | backward_allreduce_microstep: 7.19 | backward_allreduce: 2.48 | reduce_tied_grads: 0.29 | comms: 17.91 | reduce_grads: 0.18 | step: 355.69 | _step_clipping: 0.12 | _step_step: 354.00 | _step_zero_grad: 0.46 | _step_check_overflow: 0.56 samples/sec: 16.365 | iteration 6380/ 143000 | elapsed time per iteration (ms): 62570.7 | learning rate: 5.982E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.707391E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 14:40:01,620] [INFO] [logging.py:60:log_dist] [Rank 0] step=6390, skipped=0, lr=[0.0005982206863455815, 0.0005982206863455815], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6390 loss: 2.7166 iter time (s): 62.628 samples/sec: 16.350 %comms: 0.002851283525676393 %optimizer_step 0.055712065486197504 %forward: 23.177794037921686 %backward: 62.27282647003835 [2025-03-29 14:40:01,621] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21519.55 | forward: 145158.35 | backward_microstep: 390009.92 | backward: 390003.49 | backward_inner_microstep: 389988.47 | backward_inner: 389981.10 | backward_allreduce_microstep: 7.28 | backward_allreduce: 2.51 | reduce_tied_grads: 0.25 | comms: 17.86 | reduce_grads: 0.18 | step: 348.91 | _step_clipping: 0.12 | _step_step: 347.30 | _step_zero_grad: 0.46 | _step_check_overflow: 0.51 samples/sec: 16.350 | iteration 6390/ 143000 | elapsed time per iteration (ms): 62628.7 | learning rate: 5.982E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.707357E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 14:50:24,253] [INFO] [logging.py:60:log_dist] [Rank 0] step=6400, skipped=0, lr=[0.0005982135115976979, 0.0005982135115976979], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6400 loss: 2.6949 iter time (s): 62.263 samples/sec: 16.446 %comms: 0.002867255645145145 %optimizer_step 0.055433941006629045 %forward: 23.31456846405252 %backward: 62.64588727337753 [2025-03-29 14:50:24,253] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17802.59 | forward: 145162.79 | backward_microstep: 390058.31 | backward: 390050.18 | backward_inner_microstep: 390035.33 | backward_inner: 390029.62 | backward_allreduce_microstep: 7.18 | backward_allreduce: 2.46 | reduce_tied_grads: 0.28 | comms: 17.85 | reduce_grads: 0.18 | step: 345.15 | _step_clipping: 0.13 | _step_step: 343.51 | _step_zero_grad: 0.46 | _step_check_overflow: 0.53 samples/sec: 16.446 | iteration 6400/ 143000 | elapsed time per iteration (ms): 62263.2 | learning rate: 5.982E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.698877E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 15:00:50,238] [INFO] [logging.py:60:log_dist] [Rank 0] step=6410, skipped=0, lr=[0.000598206322456693, 0.000598206322456693], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6410 loss: 2.6920 iter time (s): 62.598 samples/sec: 16.358 %comms: 0.0028478978370070247 %optimizer_step 0.05486167266633003 %forward: 23.18247173753621 %backward: 62.29610549458516 [2025-03-29 15:00:50,238] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21296.62 | forward: 145117.65 | backward_microstep: 389967.14 | backward: 389961.19 | backward_inner_microstep: 389946.33 | backward_inner: 389940.80 | backward_allreduce_microstep: 7.22 | backward_allreduce: 2.51 | reduce_tied_grads: 0.24 | comms: 17.83 | reduce_grads: 0.18 | step: 343.42 | _step_clipping: 0.12 | _step_step: 341.82 | _step_zero_grad: 0.45 | _step_check_overflow: 0.51 samples/sec: 16.358 | iteration 6410/ 143000 | elapsed time per iteration (ms): 62598.5 | learning rate: 5.982E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.693192E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 15:11:17,443] [INFO] [logging.py:60:log_dist] [Rank 0] step=6420, skipped=0, lr=[0.0005981991189229138, 0.0005981991189229138], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6420 loss: 2.6906 iter time (s): 62.720 samples/sec: 16.327 %comms: 0.0028460081799708695 %optimizer_step 0.05733123804222022 %forward: 23.158060590312555 %backward: 62.19599692636134 [2025-03-29 15:11:17,444] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22205.58 | forward: 145247.33 | backward_microstep: 390100.21 | backward: 390093.23 | backward_inner_microstep: 390076.51 | backward_inner: 390070.86 | backward_allreduce_microstep: 8.95 | backward_allreduce: 2.48 | reduce_tied_grads: 0.26 | comms: 17.85 | reduce_grads: 0.18 | step: 359.58 | _step_clipping: 0.14 | _step_step: 357.96 | _step_zero_grad: 0.47 | _step_check_overflow: 0.48 samples/sec: 16.326 | iteration 6420/ 143000 | elapsed time per iteration (ms): 62720.5 | learning rate: 5.982E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.697887E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 15:21:43,072] [INFO] [logging.py:60:log_dist] [Rank 0] step=6430, skipped=0, lr=[0.0005981919009967079, 0.0005981919009967079], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6430 loss: 2.7023 iter time (s): 62.562 samples/sec: 16.368 %comms: 0.0028778733293137793 %optimizer_step 0.05625097756593759 %forward: 23.235598849114382 %backward: 62.36046682590845 [2025-03-29 15:21:43,073] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20469.91 | forward: 145367.40 | backward_microstep: 390149.61 | backward: 390141.82 | backward_inner_microstep: 390126.65 | backward_inner: 390120.74 | backward_allreduce_microstep: 7.20 | backward_allreduce: 2.47 | reduce_tied_grads: 0.30 | comms: 18.00 | reduce_grads: 0.18 | step: 351.92 | _step_clipping: 0.14 | _step_step: 350.18 | _step_zero_grad: 0.48 | _step_check_overflow: 0.57 samples/sec: 16.368 | iteration 6430/ 143000 | elapsed time per iteration (ms): 62562.9 | learning rate: 5.982E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.692663E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 15:32:09,711] [INFO] [logging.py:60:log_dist] [Rank 0] step=6440, skipped=0, lr=[0.0005981846686784238, 0.0005981846686784238], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6440 loss: 2.7010 iter time (s): 62.663 samples/sec: 16.341 %comms: 0.002883394793548029 %optimizer_step 0.056736415989512755 %forward: 23.180392546190543 %backward: 62.23684361209594 [2025-03-29 15:32:09,712] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21748.15 | forward: 145256.07 | backward_microstep: 390003.54 | backward: 389996.82 | backward_inner_microstep: 389981.98 | backward_inner: 389976.27 | backward_allreduce_microstep: 7.11 | backward_allreduce: 2.44 | reduce_tied_grads: 0.28 | comms: 18.07 | reduce_grads: 0.19 | step: 355.53 | _step_clipping: 0.14 | _step_step: 353.80 | _step_zero_grad: 0.46 | _step_check_overflow: 0.56 samples/sec: 16.341 | iteration 6440/ 143000 | elapsed time per iteration (ms): 62663.9 | learning rate: 5.982E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.691553E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 15:42:31,592] [INFO] [logging.py:60:log_dist] [Rank 0] step=6450, skipped=0, lr=[0.0005981774219684105, 0.0005981774219684105], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6450 loss: 2.7011 iter time (s): 62.188 samples/sec: 16.466 %comms: 0.0028773918828012813 %optimizer_step 0.056563377964524125 %forward: 23.341235566765846 %backward: 62.719154306875815 [2025-03-29 15:42:31,592] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17034.90 | forward: 145153.39 | backward_microstep: 390042.20 | backward: 390034.97 | backward_inner_microstep: 390019.61 | backward_inner: 390013.73 | backward_allreduce_microstep: 7.36 | backward_allreduce: 2.53 | reduce_tied_grads: 0.29 | comms: 17.89 | reduce_grads: 0.18 | step: 351.75 | _step_clipping: 0.12 | _step_step: 350.08 | _step_zero_grad: 0.47 | _step_check_overflow: 0.54 samples/sec: 16.466 | iteration 6450/ 143000 | elapsed time per iteration (ms): 62188.1 | learning rate: 5.982E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.692610E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 15:52:58,142] [INFO] [logging.py:60:log_dist] [Rank 0] step=6460, skipped=0, lr=[0.0005981701608670177, 0.0005981701608670177], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6460 loss: 2.7070 iter time (s): 62.654 samples/sec: 16.344 %comms: 0.0028912606878440383 %optimizer_step 0.056279808662714904 %forward: 23.194791863672172 %backward: 62.27381502679597 [2025-03-29 15:52:58,143] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21378.89 | forward: 145325.76 | backward_microstep: 390180.40 | backward: 390173.35 | backward_inner_microstep: 390158.46 | backward_inner: 390152.72 | backward_allreduce_microstep: 7.12 | backward_allreduce: 2.44 | reduce_tied_grads: 0.31 | comms: 18.12 | reduce_grads: 0.18 | step: 352.62 | _step_clipping: 0.13 | _step_step: 350.85 | _step_zero_grad: 0.47 | _step_check_overflow: 0.60 samples/sec: 16.343 | iteration 6460/ 143000 | elapsed time per iteration (ms): 62655.0 | learning rate: 5.982E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.693870E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 16:03:25,688] [INFO] [logging.py:60:log_dist] [Rank 0] step=6470, skipped=0, lr=[0.0005981628853745959, 0.0005981628853745959], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6470 loss: 2.6960 iter time (s): 62.754 samples/sec: 16.318 %comms: 0.002866614733748159 %optimizer_step 0.05629285619088568 %forward: 23.146781568734852 %backward: 62.17353509265272 [2025-03-29 16:03:25,689] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22473.46 | forward: 145255.35 | backward_microstep: 390170.89 | backward: 390163.90 | backward_inner_microstep: 390147.19 | backward_inner: 390141.39 | backward_allreduce_microstep: 7.26 | backward_allreduce: 2.49 | reduce_tied_grads: 0.31 | comms: 17.99 | reduce_grads: 0.18 | step: 353.26 | _step_clipping: 0.13 | _step_step: 351.54 | _step_zero_grad: 0.47 | _step_check_overflow: 0.57 samples/sec: 16.318 | iteration 6470/ 143000 | elapsed time per iteration (ms): 62754.6 | learning rate: 5.982E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.696051E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 16:13:47,659] [INFO] [logging.py:60:log_dist] [Rank 0] step=6480, skipped=0, lr=[0.0005981555954914962, 0.0005981555954914962], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6480 loss: 2.7028 iter time (s): 62.197 samples/sec: 16.464 %comms: 0.0028951059706132107 %optimizer_step 0.05639007101754937 %forward: 23.349705426887045 %backward: 62.72619801717008 [2025-03-29 16:13:47,660] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16941.55 | forward: 145227.14 | backward_microstep: 390142.86 | backward: 390135.37 | backward_inner_microstep: 390120.33 | backward_inner: 390114.49 | backward_allreduce_microstep: 7.19 | backward_allreduce: 2.49 | reduce_tied_grads: 0.29 | comms: 18.01 | reduce_grads: 0.18 | step: 350.73 | _step_clipping: 0.12 | _step_step: 349.02 | _step_zero_grad: 0.46 | _step_check_overflow: 0.57 samples/sec: 16.464 | iteration 6480/ 143000 | elapsed time per iteration (ms): 62197.1 | learning rate: 5.982E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.697341E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 16:24:09,881] [INFO] [logging.py:60:log_dist] [Rank 0] step=6490, skipped=0, lr=[0.0005981482912180706, 0.0005981482912180706], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6490 loss: 2.6952 iter time (s): 62.222 samples/sec: 16.457 %comms: 0.0028937100623831146 %optimizer_step 0.05706520085640852 %forward: 23.34495624945196 %backward: 62.69590392982218 [2025-03-29 16:24:09,882] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17197.50 | forward: 145256.10 | backward_microstep: 390111.43 | backward: 390104.07 | backward_inner_microstep: 390088.95 | backward_inner: 390083.11 | backward_allreduce_microstep: 7.21 | backward_allreduce: 2.47 | reduce_tied_grads: 0.31 | comms: 18.01 | reduce_grads: 0.20 | step: 355.07 | _step_clipping: 0.12 | _step_step: 353.44 | _step_zero_grad: 0.46 | _step_check_overflow: 0.48 samples/sec: 16.457 | iteration 6490/ 143000 | elapsed time per iteration (ms): 62222.2 | learning rate: 5.981E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.694607E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 16:34:28,003] [INFO] [logging.py:60:log_dist] [Rank 0] step=6500, skipped=0, lr=[0.0005981409725546714, 0.0005981409725546714], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6500 loss: 2.6879 iter time (s): 61.811 samples/sec: 16.567 %comms: 0.0028916436184408902 %optimizer_step 0.058043322401261595 %forward: 23.49182619711288 %backward: 63.09842714536708 [2025-03-29 16:34:28,003] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13267.98 | forward: 145205.22 | backward_microstep: 390023.84 | backward: 390017.41 | backward_inner_microstep: 390002.57 | backward_inner: 389997.12 | backward_allreduce_microstep: 7.16 | backward_allreduce: 2.45 | reduce_tied_grads: 0.26 | comms: 17.87 | reduce_grads: 0.17 | step: 358.77 | _step_clipping: 0.13 | _step_step: 357.07 | _step_zero_grad: 0.48 | _step_check_overflow: 0.56 samples/sec: 16.566 | iteration 6500/ 143000 | elapsed time per iteration (ms): 61812.1 | learning rate: 5.981E-04 | approx flops per GPU: 71.5TFLOPS | lm_loss: 2.694407E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 16:44:50,204] [INFO] [logging.py:60:log_dist] [Rank 0] step=6510, skipped=0, lr=[0.000598133639501652, 0.000598133639501652], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6510 loss: 2.7120 iter time (s): 62.220 samples/sec: 16.458 %comms: 0.003048958050333505 %optimizer_step 0.057171603739032496 %forward: 23.33761383904095 %backward: 62.68837444124452 [2025-03-29 16:44:50,204] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17330.36 | forward: 145205.66 | backward_microstep: 390051.14 | backward: 390044.44 | backward_inner_microstep: 390029.23 | backward_inner: 390023.73 | backward_allreduce_microstep: 7.20 | backward_allreduce: 2.47 | reduce_tied_grads: 0.28 | comms: 18.97 | reduce_grads: 0.19 | step: 355.72 | _step_clipping: 0.12 | _step_step: 354.04 | _step_zero_grad: 0.46 | _step_check_overflow: 0.55 samples/sec: 16.458 | iteration 6510/ 143000 | elapsed time per iteration (ms): 62220.1 | learning rate: 5.981E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.705216E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 16:55:12,859] [INFO] [logging.py:60:log_dist] [Rank 0] step=6520, skipped=0, lr=[0.0005981262920593663, 0.0005981262920593663], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6520 loss: 2.6897 iter time (s): 62.265 samples/sec: 16.446 %comms: 0.002901266991401339 %optimizer_step 0.05622470790374841 %forward: 23.326644494047 %backward: 62.65613938019759 [2025-03-29 16:55:12,860] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17658.03 | forward: 145243.34 | backward_microstep: 390139.55 | backward: 390128.42 | backward_inner_microstep: 390113.16 | backward_inner: 390105.44 | backward_allreduce_microstep: 7.26 | backward_allreduce: 2.48 | reduce_tied_grads: 0.28 | comms: 18.06 | reduce_grads: 0.18 | step: 350.08 | _step_clipping: 0.13 | _step_step: 348.27 | _step_zero_grad: 0.50 | _step_check_overflow: 0.62 samples/sec: 16.446 | iteration 6520/ 143000 | elapsed time per iteration (ms): 62265.6 | learning rate: 5.981E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.699879E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 17:05:39,467] [INFO] [logging.py:60:log_dist] [Rank 0] step=6530, skipped=0, lr=[0.0005981189302281689, 0.0005981189302281689], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6530 loss: 2.6915 iter time (s): 62.660 samples/sec: 16.342 %comms: 0.0028488330772282926 %optimizer_step 0.05485666431510852 %forward: 23.16529796278924 %backward: 62.24683619530791 [2025-03-29 17:05:39,468] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21814.00 | forward: 145154.47 | backward_microstep: 390046.56 | backward: 390040.59 | backward_inner_microstep: 390024.22 | backward_inner: 390018.68 | backward_allreduce_microstep: 7.08 | backward_allreduce: 2.47 | reduce_tied_grads: 0.25 | comms: 17.85 | reduce_grads: 0.18 | step: 343.73 | _step_clipping: 0.11 | _step_step: 342.13 | _step_zero_grad: 0.46 | _step_check_overflow: 0.51 samples/sec: 16.342 | iteration 6530/ 143000 | elapsed time per iteration (ms): 62660.8 | learning rate: 5.981E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.696227E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 17:16:06,079] [INFO] [logging.py:60:log_dist] [Rank 0] step=6540, skipped=0, lr=[0.0005981115540084152, 0.0005981115540084152], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6540 loss: 2.6556 iter time (s): 62.661 samples/sec: 16.342 %comms: 0.0028591689704390967 %optimizer_step 0.05917639642506533 %forward: 23.178436591126406 %backward: 62.25856128425505 [2025-03-29 17:16:06,080] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21606.50 | forward: 145237.49 | backward_microstep: 390122.88 | backward: 390115.91 | backward_inner_microstep: 390100.93 | backward_inner: 390095.14 | backward_allreduce_microstep: 7.29 | backward_allreduce: 2.45 | reduce_tied_grads: 0.29 | comms: 17.92 | reduce_grads: 0.18 | step: 370.80 | _step_clipping: 0.13 | _step_step: 369.15 | _step_zero_grad: 0.47 | _step_check_overflow: 0.51 samples/sec: 16.342 | iteration 6540/ 143000 | elapsed time per iteration (ms): 62661.2 | learning rate: 5.981E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.684268E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 17:26:34,243] [INFO] [logging.py:60:log_dist] [Rank 0] step=6550, skipped=0, lr=[0.000598104163400461, 0.000598104163400461], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6550 loss: 2.6756 iter time (s): 62.816 samples/sec: 16.302 %comms: 0.0028922282782153017 %optimizer_step 0.05598684567772649 %forward: 23.14838890710756 %backward: 62.11264874603625 [2025-03-29 17:26:34,244] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22953.31 | forward: 145408.22 | backward_microstep: 390172.22 | backward: 390164.95 | backward_inner_microstep: 390148.03 | backward_inner: 390142.21 | backward_allreduce_microstep: 9.05 | backward_allreduce: 2.52 | reduce_tied_grads: 0.32 | comms: 18.17 | reduce_grads: 0.20 | step: 351.69 | _step_clipping: 0.12 | _step_step: 350.00 | _step_zero_grad: 0.46 | _step_check_overflow: 0.55 samples/sec: 16.301 | iteration 6550/ 143000 | elapsed time per iteration (ms): 62816.3 | learning rate: 5.981E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.680448E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 17:36:55,012] [INFO] [logging.py:60:log_dist] [Rank 0] step=6560, skipped=0, lr=[0.0005980967584046633, 0.0005980967584046633], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6560 loss: 2.6950 iter time (s): 62.076 samples/sec: 16.496 %comms: 0.0028726722357985545 %optimizer_step 0.05525220848560378 %forward: 23.385975460131654 %backward: 62.833783447408706 [2025-03-29 17:36:55,013] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15964.07 | forward: 145171.74 | backward_microstep: 390055.33 | backward: 390049.56 | backward_inner_microstep: 390032.94 | backward_inner: 390027.30 | backward_allreduce_microstep: 7.18 | backward_allreduce: 2.47 | reduce_tied_grads: 0.25 | comms: 17.83 | reduce_grads: 0.19 | step: 342.99 | _step_clipping: 0.12 | _step_step: 341.34 | _step_zero_grad: 0.46 | _step_check_overflow: 0.53 samples/sec: 16.496 | iteration 6560/ 143000 | elapsed time per iteration (ms): 62076.9 | learning rate: 5.981E-04 | approx flops per GPU: 71.2TFLOPS | lm_loss: 2.684097E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 17:47:22,712] [INFO] [logging.py:60:log_dist] [Rank 0] step=6570, skipped=0, lr=[0.0005980893390213792, 0.0005980893390213792], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6570 loss: 2.6962 iter time (s): 62.769 samples/sec: 16.314 %comms: 0.0028601382026680047 %optimizer_step 0.05609505873590448 %forward: 23.129964233246294 %backward: 62.14637741246049 [2025-03-29 17:47:22,712] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22835.61 | forward: 145185.42 | backward_microstep: 390096.39 | backward: 390089.15 | backward_inner_microstep: 390074.07 | backward_inner: 390068.22 | backward_allreduce_microstep: 7.24 | backward_allreduce: 2.49 | reduce_tied_grads: 0.28 | comms: 17.95 | reduce_grads: 0.17 | step: 352.11 | _step_clipping: 0.13 | _step_step: 350.35 | _step_zero_grad: 0.48 | _step_check_overflow: 0.58 samples/sec: 16.314 | iteration 6570/ 143000 | elapsed time per iteration (ms): 62770.0 | learning rate: 5.981E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.690494E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 17:57:51,628] [INFO] [logging.py:60:log_dist] [Rank 0] step=6580, skipped=0, lr=[0.000598081905250967, 0.000598081905250967], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6580 loss: 2.6841 iter time (s): 62.891 samples/sec: 16.282 %comms: 0.002872274387375198 %optimizer_step 0.05617232481128444 %forward: 23.10968329099196 %backward: 62.031732072825804 [2025-03-29 17:57:51,628] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23814.89 | forward: 145339.12 | backward_microstep: 390131.84 | backward: 390123.79 | backward_inner_microstep: 390108.58 | backward_inner: 390102.65 | backward_allreduce_microstep: 7.22 | backward_allreduce: 2.49 | reduce_tied_grads: 0.28 | comms: 18.06 | reduce_grads: 0.18 | step: 353.27 | _step_clipping: 0.12 | _step_step: 351.57 | _step_zero_grad: 0.49 | _step_check_overflow: 0.52 samples/sec: 16.282 | iteration 6580/ 143000 | elapsed time per iteration (ms): 62891.6 | learning rate: 5.981E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.696097E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 18:08:14,051] [INFO] [logging.py:60:log_dist] [Rank 0] step=6590, skipped=0, lr=[0.0005980744570937853, 0.0005980744570937853], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6590 loss: 2.6866 iter time (s): 62.242 samples/sec: 16.452 %comms: 0.0028726235208463386 %optimizer_step 0.05752505881280377 %forward: 23.322026990328347 %backward: 62.667885870301696 [2025-03-29 18:08:14,051] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17597.94 | forward: 145160.48 | backward_microstep: 390062.32 | backward: 390056.17 | backward_inner_microstep: 390039.80 | backward_inner: 390034.33 | backward_allreduce_microstep: 8.85 | backward_allreduce: 4.23 | reduce_tied_grads: 0.27 | comms: 17.88 | reduce_grads: 0.18 | step: 358.05 | _step_clipping: 0.13 | _step_step: 356.34 | _step_zero_grad: 0.46 | _step_check_overflow: 0.58 samples/sec: 16.452 | iteration 6590/ 143000 | elapsed time per iteration (ms): 62242.3 | learning rate: 5.981E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.682310E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 18:18:36,469] [INFO] [logging.py:60:log_dist] [Rank 0] step=6600, skipped=0, lr=[0.0005980669945501939, 0.0005980669945501939], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6600 loss: 2.6739 iter time (s): 62.241 samples/sec: 16.452 %comms: 0.0028666302523621258 %optimizer_step 0.056409365933147264 %forward: 23.332020671588072 %backward: 62.6685697364162 [2025-03-29 18:18:36,470] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17572.40 | forward: 145221.64 | backward_microstep: 390063.42 | backward: 390057.62 | backward_inner_microstep: 390043.13 | backward_inner: 390037.77 | backward_allreduce_microstep: 7.06 | backward_allreduce: 2.42 | reduce_tied_grads: 0.25 | comms: 17.84 | reduce_grads: 0.17 | step: 351.10 | _step_clipping: 0.12 | _step_step: 349.52 | _step_zero_grad: 0.46 | _step_check_overflow: 0.48 samples/sec: 16.452 | iteration 6600/ 143000 | elapsed time per iteration (ms): 62241.9 | learning rate: 5.981E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.678783E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 18:28:58,932] [INFO] [logging.py:60:log_dist] [Rank 0] step=6610, skipped=0, lr=[0.0005980595176205526, 0.0005980595176205526], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6610 loss: 2.7139 iter time (s): 62.246 samples/sec: 16.451 %comms: 0.002908487283193022 %optimizer_step 0.05636472468221228 %forward: 23.31986452275887 %backward: 62.66078992844608 [2025-03-29 18:28:58,932] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17698.96 | forward: 145156.08 | backward_microstep: 390043.23 | backward: 390036.34 | backward_inner_microstep: 390021.34 | backward_inner: 390015.67 | backward_allreduce_microstep: 7.17 | backward_allreduce: 2.47 | reduce_tied_grads: 0.29 | comms: 18.10 | reduce_grads: 0.18 | step: 350.85 | _step_clipping: 0.12 | _step_step: 349.17 | _step_zero_grad: 0.46 | _step_check_overflow: 0.55 samples/sec: 16.451 | iteration 6610/ 143000 | elapsed time per iteration (ms): 62246.2 | learning rate: 5.981E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.697715E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 18:39:26,731] [INFO] [logging.py:60:log_dist] [Rank 0] step=6620, skipped=0, lr=[0.0005980520263052224, 0.0005980520263052224], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6620 loss: 2.7141 iter time (s): 62.779 samples/sec: 16.311 %comms: 0.0028477578181135693 %optimizer_step 0.05539786357308987 %forward: 23.132681133725526 %backward: 62.1369570109861 [2025-03-29 18:39:26,731] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22887.22 | forward: 145225.61 | backward_microstep: 390098.83 | backward: 390092.15 | backward_inner_microstep: 390071.59 | backward_inner: 390065.95 | backward_allreduce_microstep: 9.19 | backward_allreduce: 4.47 | reduce_tied_grads: 0.26 | comms: 17.88 | reduce_grads: 0.18 | step: 347.78 | _step_clipping: 0.11 | _step_step: 345.95 | _step_zero_grad: 0.47 | _step_check_overflow: 0.72 samples/sec: 16.311 | iteration 6620/ 143000 | elapsed time per iteration (ms): 62779.9 | learning rate: 5.981E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.695989E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 18:49:48,879] [INFO] [logging.py:60:log_dist] [Rank 0] step=6630, skipped=0, lr=[0.0005980445206045649, 0.0005980445206045649], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6630 loss: 2.6780 iter time (s): 62.214 samples/sec: 16.459 %comms: 0.0028830141349560235 %optimizer_step 0.056441178265293705 %forward: 23.345263847047313 %backward: 62.71419513434013 [2025-03-29 18:49:48,880] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17139.36 | forward: 145240.90 | backward_microstep: 390178.76 | backward: 390171.92 | backward_inner_microstep: 390155.25 | backward_inner: 390149.45 | backward_allreduce_microstep: 8.87 | backward_allreduce: 4.19 | reduce_tied_grads: 0.30 | comms: 17.94 | reduce_grads: 0.20 | step: 351.14 | _step_clipping: 0.14 | _step_step: 349.44 | _step_zero_grad: 0.47 | _step_check_overflow: 0.54 samples/sec: 16.459 | iteration 6630/ 143000 | elapsed time per iteration (ms): 62214.8 | learning rate: 5.980E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.695679E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 19:00:15,971] [INFO] [logging.py:60:log_dist] [Rank 0] step=6640, skipped=0, lr=[0.0005980370005189426, 0.0005980370005189426], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6640 loss: 2.6734 iter time (s): 62.709 samples/sec: 16.329 %comms: 0.0028533678546702455 %optimizer_step 0.05571969787107494 %forward: 23.15351949302284 %backward: 62.20271335128543 [2025-03-29 19:00:15,971] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22292.97 | forward: 145192.52 | backward_microstep: 390073.34 | backward: 390064.63 | backward_inner_microstep: 390049.89 | backward_inner: 390044.31 | backward_allreduce_microstep: 7.12 | backward_allreduce: 2.41 | reduce_tied_grads: 0.27 | comms: 17.89 | reduce_grads: 0.18 | step: 349.41 | _step_clipping: 0.12 | _step_step: 347.72 | _step_zero_grad: 0.46 | _step_check_overflow: 0.58 samples/sec: 16.329 | iteration 6640/ 143000 | elapsed time per iteration (ms): 62709.2 | learning rate: 5.980E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.685649E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 19:10:39,113] [INFO] [logging.py:60:log_dist] [Rank 0] step=6650, skipped=0, lr=[0.000598029466048718, 0.000598029466048718], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6650 loss: 2.6813 iter time (s): 62.314 samples/sec: 16.433 %comms: 0.002871259825953568 %optimizer_step 0.05766471049101408 %forward: 23.312659956498326 %backward: 62.61766162132704 [2025-03-29 19:10:39,114] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18088.71 | forward: 145269.82 | backward_microstep: 390201.53 | backward: 390193.85 | backward_inner_microstep: 390178.55 | backward_inner: 390172.72 | backward_allreduce_microstep: 7.34 | backward_allreduce: 2.52 | reduce_tied_grads: 0.28 | comms: 17.89 | reduce_grads: 0.18 | step: 359.33 | _step_clipping: 0.14 | _step_step: 357.62 | _step_zero_grad: 0.47 | _step_check_overflow: 0.56 samples/sec: 16.433 | iteration 6650/ 143000 | elapsed time per iteration (ms): 62314.3 | learning rate: 5.980E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.677960E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 19:21:04,832] [INFO] [logging.py:60:log_dist] [Rank 0] step=6660, skipped=0, lr=[0.000598021917194255, 0.000598021917194255], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6660 loss: 2.7065 iter time (s): 62.571 samples/sec: 16.365 %comms: 0.0028549390244126645 %optimizer_step 0.05543289769526659 %forward: 23.21133643452352 %backward: 62.3271863649048 [2025-03-29 19:21:04,833] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20970.17 | forward: 145236.56 | backward_microstep: 389996.25 | backward: 389989.87 | backward_inner_microstep: 389975.13 | backward_inner: 389967.21 | backward_allreduce_microstep: 7.11 | backward_allreduce: 2.44 | reduce_tied_grads: 0.27 | comms: 17.86 | reduce_grads: 0.18 | step: 346.85 | _step_clipping: 0.12 | _step_step: 345.16 | _step_zero_grad: 0.46 | _step_check_overflow: 0.58 samples/sec: 16.365 | iteration 6660/ 143000 | elapsed time per iteration (ms): 62571.9 | learning rate: 5.980E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.690165E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 19:31:27,185] [INFO] [logging.py:60:log_dist] [Rank 0] step=6670, skipped=0, lr=[0.000598014353955918, 0.000598014353955918], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6670 loss: 2.7076 iter time (s): 62.235 samples/sec: 16.454 %comms: 0.002863179399639289 %optimizer_step 0.0562966931753235 %forward: 23.30648323876801 %backward: 62.62381101258626 [2025-03-29 19:31:27,186] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18085.19 | forward: 145047.33 | backward_microstep: 389743.63 | backward: 389737.77 | backward_inner_microstep: 389723.33 | backward_inner: 389718.07 | backward_allreduce_microstep: 6.99 | backward_allreduce: 2.41 | reduce_tied_grads: 0.25 | comms: 17.82 | reduce_grads: 0.17 | step: 350.36 | _step_clipping: 0.11 | _step_step: 348.62 | _step_zero_grad: 0.48 | _step_check_overflow: 0.60 samples/sec: 16.454 | iteration 6670/ 143000 | elapsed time per iteration (ms): 62235.3 | learning rate: 5.980E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.694389E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 19:41:49,580] [INFO] [logging.py:60:log_dist] [Rank 0] step=6680, skipped=0, lr=[0.0005980067763340718, 0.0005980067763340718], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6680 loss: 2.7001 iter time (s): 62.239 samples/sec: 16.453 %comms: 0.0028752851334639218 %optimizer_step 0.05652906569732753 %forward: 23.332500702590607 %backward: 62.688317031382034 [2025-03-29 19:41:49,581] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17460.87 | forward: 145218.93 | backward_microstep: 390172.24 | backward: 390165.23 | backward_inner_microstep: 390148.62 | backward_inner: 390142.88 | backward_allreduce_microstep: 7.12 | backward_allreduce: 2.44 | reduce_tied_grads: 0.27 | comms: 17.90 | reduce_grads: 0.18 | step: 351.83 | _step_clipping: 0.14 | _step_step: 350.15 | _step_zero_grad: 0.47 | _step_check_overflow: 0.52 samples/sec: 16.453 | iteration 6680/ 143000 | elapsed time per iteration (ms): 62239.5 | learning rate: 5.980E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.694502E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 19:52:14,889] [INFO] [logging.py:60:log_dist] [Rank 0] step=6690, skipped=0, lr=[0.0005979991843290824, 0.0005979991843290824], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6690 loss: 2.6749 iter time (s): 62.530 samples/sec: 16.376 %comms: 0.0028553246511754077 %optimizer_step 0.055826063677483176 %forward: 23.231076697236645 %backward: 62.37838964815662 [2025-03-29 19:52:14,890] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20466.85 | forward: 145264.79 | backward_microstep: 390061.49 | backward: 390054.41 | backward_inner_microstep: 390039.31 | backward_inner: 390033.57 | backward_allreduce_microstep: 7.28 | backward_allreduce: 2.50 | reduce_tied_grads: 0.27 | comms: 17.85 | reduce_grads: 0.18 | step: 349.08 | _step_clipping: 0.12 | _step_step: 347.46 | _step_zero_grad: 0.46 | _step_check_overflow: 0.51 samples/sec: 16.376 | iteration 6690/ 143000 | elapsed time per iteration (ms): 62530.9 | learning rate: 5.980E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.687255E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 20:02:42,416] [INFO] [logging.py:60:log_dist] [Rank 0] step=6700, skipped=0, lr=[0.000597991577941316, 0.000597991577941316], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6700 loss: 2.6742 iter time (s): 62.752 samples/sec: 16.318 %comms: 0.0028851650390028754 %optimizer_step 0.055522386829613156 %forward: 23.14950934008656 %backward: 62.1742435269892 [2025-03-29 20:02:42,416] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22557.89 | forward: 145268.14 | backward_microstep: 390164.02 | backward: 390156.73 | backward_inner_microstep: 390141.46 | backward_inner: 390135.56 | backward_allreduce_microstep: 7.29 | backward_allreduce: 2.51 | reduce_tied_grads: 0.28 | comms: 18.11 | reduce_grads: 0.39 | step: 348.41 | _step_clipping: 0.16 | _step_step: 346.57 | _step_zero_grad: 0.46 | _step_check_overflow: 0.68 samples/sec: 16.318 | iteration 6700/ 143000 | elapsed time per iteration (ms): 62752.7 | learning rate: 5.980E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.682044E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 20:13:04,961] [INFO] [logging.py:60:log_dist] [Rank 0] step=6710, skipped=0, lr=[0.0005979839571711398, 0.0005979839571711398], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6710 loss: 2.6782 iter time (s): 62.254 samples/sec: 16.449 %comms: 0.0028668891795680866 %optimizer_step 0.057514221092367236 %forward: 23.320219935537743 %backward: 62.65370425474235 [2025-03-29 20:13:04,962] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17797.62 | forward: 145177.75 | backward_microstep: 390050.10 | backward: 390044.50 | backward_inner_microstep: 390028.28 | backward_inner: 390022.79 | backward_allreduce_microstep: 8.79 | backward_allreduce: 2.44 | reduce_tied_grads: 0.26 | comms: 17.85 | reduce_grads: 0.19 | step: 358.05 | _step_clipping: 0.13 | _step_step: 356.40 | _step_zero_grad: 0.46 | _step_check_overflow: 0.52 samples/sec: 16.449 | iteration 6710/ 143000 | elapsed time per iteration (ms): 62254.5 | learning rate: 5.980E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.675722E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 20:23:27,443] [INFO] [logging.py:60:log_dist] [Rank 0] step=6720, skipped=0, lr=[0.0005979763220189217, 0.0005979763220189217], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6720 loss: 2.6588 iter time (s): 62.248 samples/sec: 16.450 %comms: 0.0028898974503701546 %optimizer_step 0.05534564785622684 %forward: 23.334866509898795 %backward: 62.67350621447453 [2025-03-29 20:23:27,444] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17553.93 | forward: 145253.96 | backward_microstep: 390134.35 | backward: 390127.58 | backward_inner_microstep: 390112.61 | backward_inner: 390106.84 | backward_allreduce_microstep: 7.18 | backward_allreduce: 2.46 | reduce_tied_grads: 0.30 | comms: 17.99 | reduce_grads: 0.18 | step: 344.51 | _step_clipping: 0.12 | _step_step: 342.84 | _step_zero_grad: 0.46 | _step_check_overflow: 0.54 samples/sec: 16.450 | iteration 6720/ 143000 | elapsed time per iteration (ms): 62248.2 | learning rate: 5.980E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.674244E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 20:33:55,987] [INFO] [logging.py:60:log_dist] [Rank 0] step=6730, skipped=0, lr=[0.0005979686724850302, 0.0005979686724850302], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6730 loss: 2.6710 iter time (s): 62.854 samples/sec: 16.292 %comms: 0.0028885803164448364 %optimizer_step 0.05614469125526252 %forward: 23.127859209359546 %backward: 62.07246443301916 [2025-03-29 20:33:55,988] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23474.66 | forward: 145367.29 | backward_microstep: 390156.38 | backward: 390148.77 | backward_inner_microstep: 390131.88 | backward_inner: 390124.40 | backward_allreduce_microstep: 7.28 | backward_allreduce: 2.51 | reduce_tied_grads: 0.30 | comms: 18.16 | reduce_grads: 0.18 | step: 352.89 | _step_clipping: 0.13 | _step_step: 351.09 | _step_zero_grad: 0.50 | _step_check_overflow: 0.58 samples/sec: 16.292 | iteration 6730/ 143000 | elapsed time per iteration (ms): 62854.4 | learning rate: 5.980E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.674373E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 20:44:22,795] [INFO] [logging.py:60:log_dist] [Rank 0] step=6740, skipped=0, lr=[0.0005979610085698342, 0.0005979610085698342], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6740 loss: 2.6882 iter time (s): 62.680 samples/sec: 16.337 %comms: 0.002857401146814019 %optimizer_step 0.05586966485076335 %forward: 23.16322880563133 %backward: 62.2241647118292 [2025-03-29 20:44:22,795] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22116.29 | forward: 145187.54 | backward_microstep: 390028.79 | backward: 390022.19 | backward_inner_microstep: 390007.28 | backward_inner: 390001.69 | backward_allreduce_microstep: 7.21 | backward_allreduce: 2.47 | reduce_tied_grads: 0.27 | comms: 17.91 | reduce_grads: 0.18 | step: 350.19 | _step_clipping: 0.13 | _step_step: 348.44 | _step_zero_grad: 0.47 | _step_check_overflow: 0.56 samples/sec: 16.337 | iteration 6740/ 143000 | elapsed time per iteration (ms): 62680.7 | learning rate: 5.980E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.680488E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 20:54:49,729] [INFO] [logging.py:60:log_dist] [Rank 0] step=6750, skipped=0, lr=[0.000597953330273704, 0.000597953330273704], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6750 loss: 2.6632 iter time (s): 62.693 samples/sec: 16.334 %comms: 0.0028713861838971628 %optimizer_step 0.055910228937319596 %forward: 23.17268832231616 %backward: 62.23097874496764 [2025-03-29 20:54:49,730] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21996.31 | forward: 145276.33 | backward_microstep: 390152.84 | backward: 390144.13 | backward_inner_microstep: 390124.67 | backward_inner: 390118.97 | backward_allreduce_microstep: 11.69 | backward_allreduce: 2.45 | reduce_tied_grads: 0.27 | comms: 18.00 | reduce_grads: 0.18 | step: 350.52 | _step_clipping: 0.13 | _step_step: 348.84 | _step_zero_grad: 0.47 | _step_check_overflow: 0.54 samples/sec: 16.333 | iteration 6750/ 143000 | elapsed time per iteration (ms): 62693.4 | learning rate: 5.980E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.678009E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 21:05:12,091] [INFO] [logging.py:60:log_dist] [Rank 0] step=6760, skipped=0, lr=[0.00059794563759701, 0.00059794563759701], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6760 loss: 2.6880 iter time (s): 62.236 samples/sec: 16.454 %comms: 0.0028737130837308044 %optimizer_step 0.05585900001062769 %forward: 23.33527611927012 %backward: 62.68493398532768 [2025-03-29 21:05:12,091] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17519.57 | forward: 145228.54 | backward_microstep: 390130.38 | backward: 390123.57 | backward_inner_microstep: 390104.98 | backward_inner: 390097.52 | backward_allreduce_microstep: 10.79 | backward_allreduce: 2.47 | reduce_tied_grads: 0.27 | comms: 17.88 | reduce_grads: 0.18 | step: 347.64 | _step_clipping: 0.13 | _step_step: 345.99 | _step_zero_grad: 0.46 | _step_check_overflow: 0.52 samples/sec: 16.453 | iteration 6760/ 143000 | elapsed time per iteration (ms): 62236.2 | learning rate: 5.979E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.674382E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 21:15:35,068] [INFO] [logging.py:60:log_dist] [Rank 0] step=6770, skipped=0, lr=[0.0005979379305401235, 0.0005979379305401235], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6770 loss: 2.6568 iter time (s): 62.297 samples/sec: 16.437 %comms: 0.0029078807790036904 %optimizer_step 0.059039268223575574 %forward: 23.317301605629766 %backward: 62.630500564662405 [2025-03-29 21:15:35,069] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17984.70 | forward: 145260.25 | backward_microstep: 390177.82 | backward: 390170.46 | backward_inner_microstep: 390153.76 | backward_inner: 390147.89 | backward_allreduce_microstep: 7.16 | backward_allreduce: 2.47 | reduce_tied_grads: 0.30 | comms: 18.12 | reduce_grads: 0.18 | step: 367.80 | _step_clipping: 0.14 | _step_step: 366.09 | _step_zero_grad: 0.46 | _step_check_overflow: 0.56 samples/sec: 16.437 | iteration 6770/ 143000 | elapsed time per iteration (ms): 62297.7 | learning rate: 5.979E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.668742E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 21:25:58,099] [INFO] [logging.py:60:log_dist] [Rank 0] step=6780, skipped=0, lr=[0.0005979302091034164, 0.0005979302091034164], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6780 loss: 2.6589 iter time (s): 62.303 samples/sec: 16.436 %comms: 0.0028834828985835627 %optimizer_step 0.05647845617117877 %forward: 23.310361192082375 %backward: 62.61308425341443 [2025-03-29 21:25:58,100] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18198.62 | forward: 145229.54 | backward_microstep: 390105.29 | backward: 390095.62 | backward_inner_microstep: 390080.38 | backward_inner: 390074.54 | backward_allreduce_microstep: 7.28 | backward_allreduce: 2.49 | reduce_tied_grads: 0.29 | comms: 17.96 | reduce_grads: 0.18 | step: 351.88 | _step_clipping: 0.12 | _step_step: 350.27 | _step_zero_grad: 0.47 | _step_check_overflow: 0.47 samples/sec: 16.436 | iteration 6780/ 143000 | elapsed time per iteration (ms): 62303.1 | learning rate: 5.979E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.662531E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 21:36:19,478] [INFO] [logging.py:60:log_dist] [Rank 0] step=6790, skipped=0, lr=[0.0005979224732872616, 0.0005979224732872616], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6790 loss: 2.6673 iter time (s): 62.137 samples/sec: 16.480 %comms: 0.003395593992237022 %optimizer_step 0.05562536018617373 %forward: 23.354192355789554 %backward: 62.75009060356203 [2025-03-29 21:36:19,479] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16888.43 | forward: 145116.82 | backward_microstep: 389920.01 | backward: 389912.58 | backward_inner_microstep: 389898.03 | backward_inner: 389892.64 | backward_allreduce_microstep: 7.06 | backward_allreduce: 2.43 | reduce_tied_grads: 1.81 | comms: 21.10 | reduce_grads: 0.17 | step: 345.64 | _step_clipping: 0.12 | _step_step: 344.00 | _step_zero_grad: 0.46 | _step_check_overflow: 0.51 samples/sec: 16.479 | iteration 6790/ 143000 | elapsed time per iteration (ms): 62137.9 | learning rate: 5.979E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.666283E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 21:46:41,418] [INFO] [logging.py:60:log_dist] [Rank 0] step=6800, skipped=0, lr=[0.0005979147230920322, 0.0005979147230920322], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6800 loss: 2.6555 iter time (s): 62.193 samples/sec: 16.465 %comms: 0.002876123540864748 %optimizer_step 0.0561706628517695 %forward: 23.344901316686375 %backward: 62.72764750615381 [2025-03-29 21:46:41,418] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17147.28 | forward: 145189.90 | backward_microstep: 390131.51 | backward: 390124.61 | backward_inner_microstep: 390109.40 | backward_inner: 390103.73 | backward_allreduce_microstep: 7.19 | backward_allreduce: 2.49 | reduce_tied_grads: 0.30 | comms: 17.89 | reduce_grads: 0.18 | step: 349.34 | _step_clipping: 0.12 | _step_step: 347.67 | _step_zero_grad: 0.46 | _step_check_overflow: 0.55 samples/sec: 16.465 | iteration 6800/ 143000 | elapsed time per iteration (ms): 62193.9 | learning rate: 5.979E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.662315E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 21:57:09,096] [INFO] [logging.py:60:log_dist] [Rank 0] step=6810, skipped=0, lr=[0.0005979069585181024, 0.0005979069585181024], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6810 loss: 2.6655 iter time (s): 62.767 samples/sec: 16.314 %comms: 0.002868100056523302 %optimizer_step 0.056878077855536474 %forward: 23.142266635996414 %backward: 62.16597292808611 [2025-03-29 21:57:09,096] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22710.51 | forward: 145257.61 | backward_microstep: 390206.39 | backward: 390198.64 | backward_inner_microstep: 390183.17 | backward_inner: 390177.31 | backward_allreduce_microstep: 7.31 | backward_allreduce: 2.48 | reduce_tied_grads: 0.29 | comms: 18.00 | reduce_grads: 0.19 | step: 357.01 | _step_clipping: 0.15 | _step_step: 355.18 | _step_zero_grad: 0.48 | _step_check_overflow: 0.60 samples/sec: 16.314 | iteration 6810/ 143000 | elapsed time per iteration (ms): 62767.8 | learning rate: 5.979E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.666916E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 22:07:26,371] [INFO] [logging.py:60:log_dist] [Rank 0] step=6820, skipped=0, lr=[0.0005978991795658469, 0.0005978991795658469], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6820 loss: 2.6469 iter time (s): 61.727 samples/sec: 16.589 %comms: 0.0028915570435418055 %optimizer_step 0.05645177743176788 %forward: 23.51608118304702 %backward: 63.1949001605758 [2025-03-29 22:07:26,372] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 12585.56 | forward: 145157.84 | backward_microstep: 390089.14 | backward: 390083.51 | backward_inner_microstep: 390069.16 | backward_inner: 390063.96 | backward_allreduce_microstep: 7.02 | backward_allreduce: 2.40 | reduce_tied_grads: 0.25 | comms: 17.85 | reduce_grads: 0.19 | step: 348.46 | _step_clipping: 0.14 | _step_step: 346.81 | _step_zero_grad: 0.45 | _step_check_overflow: 0.52 samples/sec: 16.589 | iteration 6820/ 143000 | elapsed time per iteration (ms): 61727.6 | learning rate: 5.979E-04 | approx flops per GPU: 71.6TFLOPS | lm_loss: 2.675157E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 22:17:53,912] [INFO] [logging.py:60:log_dist] [Rank 0] step=6830, skipped=0, lr=[0.0005978913862356412, 0.0005978913862356412], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6830 loss: 2.6584 iter time (s): 62.754 samples/sec: 16.318 %comms: 0.002858772414136821 %optimizer_step 0.059478803141922526 %forward: 23.151126943510754 %backward: 62.18226186445737 [2025-03-29 22:17:53,913] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22518.51 | forward: 145281.49 | backward_microstep: 390223.95 | backward: 390215.64 | backward_inner_microstep: 390198.55 | backward_inner: 390192.66 | backward_allreduce_microstep: 7.21 | backward_allreduce: 2.48 | reduce_tied_grads: 0.29 | comms: 17.94 | reduce_grads: 0.18 | step: 373.25 | _step_clipping: 0.16 | _step_step: 371.55 | _step_zero_grad: 0.47 | _step_check_overflow: 0.52 samples/sec: 16.318 | iteration 6830/ 143000 | elapsed time per iteration (ms): 62754.1 | learning rate: 5.979E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.659557E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 22:28:15,469] [INFO] [logging.py:60:log_dist] [Rank 0] step=6840, skipped=0, lr=[0.0005978835785278614, 0.0005978835785278614], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6840 loss: 2.6462 iter time (s): 62.155 samples/sec: 16.475 %comms: 0.002890399163111758 %optimizer_step 0.05849053664779109 %forward: 23.364895186466896 %backward: 62.76602700405515 [2025-03-29 22:28:15,469] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16731.43 | forward: 145224.84 | backward_microstep: 390130.67 | backward: 390123.13 | backward_inner_microstep: 390108.15 | backward_inner: 390102.43 | backward_allreduce_microstep: 7.19 | backward_allreduce: 2.48 | reduce_tied_grads: 0.28 | comms: 17.97 | reduce_grads: 0.18 | step: 363.55 | _step_clipping: 0.14 | _step_step: 361.78 | _step_zero_grad: 0.46 | _step_check_overflow: 0.62 samples/sec: 16.475 | iteration 6840/ 143000 | elapsed time per iteration (ms): 62155.7 | learning rate: 5.979E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.662267E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 22:38:41,080] [INFO] [logging.py:60:log_dist] [Rank 0] step=6850, skipped=0, lr=[0.0005978757564428843, 0.0005978757564428843], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6850 loss: 2.6721 iter time (s): 62.561 samples/sec: 16.368 %comms: 0.002858136964565128 %optimizer_step 0.05528326757001427 %forward: 23.212397829778116 %backward: 62.34349121976338 [2025-03-29 22:38:41,081] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20931.02 | forward: 145218.17 | backward_microstep: 390031.79 | backward: 390024.66 | backward_inner_microstep: 390009.63 | backward_inner: 390003.89 | backward_allreduce_microstep: 7.20 | backward_allreduce: 2.47 | reduce_tied_grads: 0.28 | comms: 17.88 | reduce_grads: 0.18 | step: 345.86 | _step_clipping: 0.11 | _step_step: 344.23 | _step_zero_grad: 0.46 | _step_check_overflow: 0.53 samples/sec: 16.368 | iteration 6850/ 143000 | elapsed time per iteration (ms): 62561.1 | learning rate: 5.979E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.673921E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 22:49:02,733] [INFO] [logging.py:60:log_dist] [Rank 0] step=6860, skipped=0, lr=[0.0005978679199810875, 0.0005978679199810875], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6860 loss: 2.6626 iter time (s): 62.165 samples/sec: 16.472 %comms: 0.0028779493075042776 %optimizer_step 0.05998475695926928 %forward: 23.347262064907223 %backward: 62.73347418889155 [2025-03-29 22:49:02,733] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17062.77 | forward: 145137.60 | backward_microstep: 389986.60 | backward: 389980.90 | backward_inner_microstep: 389964.58 | backward_inner: 389959.19 | backward_allreduce_microstep: 7.08 | backward_allreduce: 2.43 | reduce_tied_grads: 0.25 | comms: 17.89 | reduce_grads: 0.18 | step: 372.89 | _step_clipping: 0.13 | _step_step: 371.19 | _step_zero_grad: 0.49 | _step_check_overflow: 0.53 samples/sec: 16.472 | iteration 6860/ 143000 | elapsed time per iteration (ms): 62165.2 | learning rate: 5.979E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.670586E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 22:59:29,206] [INFO] [logging.py:60:log_dist] [Rank 0] step=6870, skipped=0, lr=[0.0005978600691428493, 0.0005978600691428493], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6870 loss: 2.6633 iter time (s): 62.647 samples/sec: 16.346 %comms: 0.0028694281288880624 %optimizer_step 0.055964942300862755 %forward: 23.1849860556666 %backward: 62.26763899765394 [2025-03-29 22:59:29,207] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21648.97 | forward: 145246.49 | backward_microstep: 390094.41 | backward: 390086.76 | backward_inner_microstep: 390068.30 | backward_inner: 390062.64 | backward_allreduce_microstep: 10.67 | backward_allreduce: 4.24 | reduce_tied_grads: 0.28 | comms: 17.98 | reduce_grads: 0.18 | step: 350.60 | _step_clipping: 0.13 | _step_step: 348.90 | _step_zero_grad: 0.47 | _step_check_overflow: 0.56 samples/sec: 16.345 | iteration 6870/ 143000 | elapsed time per iteration (ms): 62647.4 | learning rate: 5.979E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.670230E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 23:09:57,509] [INFO] [logging.py:60:log_dist] [Rank 0] step=6880, skipped=0, lr=[0.0005978522039285485, 0.0005978522039285485], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6880 loss: 2.6893 iter time (s): 62.830 samples/sec: 16.298 %comms: 0.0028446031975103707 %optimizer_step 0.055848641806990854 %forward: 23.12894258092761 %backward: 62.097807106274374 [2025-03-29 23:09:57,510] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23335.29 | forward: 145318.58 | backward_microstep: 390167.44 | backward: 390159.00 | backward_inner_microstep: 390143.87 | backward_inner: 390138.03 | backward_allreduce_microstep: 7.20 | backward_allreduce: 2.48 | reduce_tied_grads: 0.27 | comms: 17.87 | reduce_grads: 0.18 | step: 350.90 | _step_clipping: 0.13 | _step_step: 349.21 | _step_zero_grad: 0.46 | _step_check_overflow: 0.56 samples/sec: 16.298 | iteration 6880/ 143000 | elapsed time per iteration (ms): 62830.3 | learning rate: 5.979E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.662178E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 23:20:20,150] [INFO] [logging.py:60:log_dist] [Rank 0] step=6890, skipped=0, lr=[0.0005978443243385646, 0.0005978443243385646], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6890 loss: 2.6800 iter time (s): 62.264 samples/sec: 16.446 %comms: 0.0028804244253830885 %optimizer_step 0.056178902270776085 %forward: 23.322149785453583 %backward: 62.641221142622996 [2025-03-29 23:20:20,151] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17941.24 | forward: 145212.11 | backward_microstep: 390032.88 | backward: 390026.82 | backward_inner_microstep: 390009.70 | backward_inner: 390004.32 | backward_allreduce_microstep: 9.61 | backward_allreduce: 2.44 | reduce_tied_grads: 0.27 | comms: 17.93 | reduce_grads: 0.20 | step: 349.79 | _step_clipping: 0.14 | _step_step: 348.06 | _step_zero_grad: 0.46 | _step_check_overflow: 0.59 samples/sec: 16.446 | iteration 6890/ 143000 | elapsed time per iteration (ms): 62264.1 | learning rate: 5.978E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.677761E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 23:30:45,163] [INFO] [logging.py:60:log_dist] [Rank 0] step=6900, skipped=0, lr=[0.0005978364303732782, 0.0005978364303732782], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6900 loss: 2.6646 iter time (s): 62.501 samples/sec: 16.384 %comms: 0.0028731973108702675 %optimizer_step 0.05521894610934193 %forward: 23.22661303876692 %backward: 62.39517550926345 [2025-03-29 23:30:45,163] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20419.28 | forward: 145167.98 | backward_microstep: 389980.13 | backward: 389974.28 | backward_inner_microstep: 389959.49 | backward_inner: 389953.99 | backward_allreduce_microstep: 7.21 | backward_allreduce: 2.48 | reduce_tied_grads: 0.27 | comms: 17.96 | reduce_grads: 0.17 | step: 345.12 | _step_clipping: 0.11 | _step_step: 343.51 | _step_zero_grad: 0.46 | _step_check_overflow: 0.52 samples/sec: 16.384 | iteration 6900/ 143000 | elapsed time per iteration (ms): 62501.2 | learning rate: 5.978E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.672072E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 23:41:11,525] [INFO] [logging.py:60:log_dist] [Rank 0] step=6910, skipped=0, lr=[0.00059782852203307, 0.00059782852203307], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6910 loss: 2.6619 iter time (s): 62.636 samples/sec: 16.349 %comms: 0.002855929772555571 %optimizer_step 0.05521602527395798 %forward: 23.183323978601738 %backward: 62.268920751329304 [2025-03-29 23:41:11,525] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21667.18 | forward: 145210.30 | backward_microstep: 390031.81 | backward: 390025.56 | backward_inner_microstep: 390010.48 | backward_inner: 390004.75 | backward_allreduce_microstep: 7.31 | backward_allreduce: 2.50 | reduce_tied_grads: 0.27 | comms: 17.89 | reduce_grads: 0.18 | step: 345.85 | _step_clipping: 0.14 | _step_step: 344.22 | _step_zero_grad: 0.46 | _step_check_overflow: 0.50 samples/sec: 16.348 | iteration 6910/ 143000 | elapsed time per iteration (ms): 62636.2 | learning rate: 5.978E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.660750E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-29 23:51:34,381] [INFO] [logging.py:60:log_dist] [Rank 0] step=6920, skipped=0, lr=[0.0005978205993183219, 0.0005978205993183219], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6920 loss: 2.6556 iter time (s): 62.285 samples/sec: 16.441 %comms: 0.002873381537035009 %optimizer_step 0.05635804956952808 %forward: 23.314367334759282 %backward: 62.62816131918113 [2025-03-29 23:51:34,382] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18091.08 | forward: 145213.81 | backward_microstep: 390089.80 | backward: 390080.24 | backward_inner_microstep: 390064.93 | backward_inner: 390057.44 | backward_allreduce_microstep: 7.32 | backward_allreduce: 2.51 | reduce_tied_grads: 0.28 | comms: 17.90 | reduce_grads: 0.18 | step: 351.03 | _step_clipping: 0.14 | _step_step: 349.37 | _step_zero_grad: 0.46 | _step_check_overflow: 0.52 samples/sec: 16.440 | iteration 6920/ 143000 | elapsed time per iteration (ms): 62285.6 | learning rate: 5.978E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.661163E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 00:02:01,177] [INFO] [logging.py:60:log_dist] [Rank 0] step=6930, skipped=0, lr=[0.0005978126622294161, 0.0005978126622294161], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6930 loss: 2.6811 iter time (s): 62.679 samples/sec: 16.337 %comms: 0.0028698152999067964 %optimizer_step 0.05564328651517454 %forward: 23.17241972309721 %backward: 62.236765448924004 [2025-03-30 00:02:01,178] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21980.44 | forward: 145242.51 | backward_microstep: 390101.19 | backward: 390094.10 | backward_inner_microstep: 390077.13 | backward_inner: 390071.38 | backward_allreduce_microstep: 7.19 | backward_allreduce: 2.46 | reduce_tied_grads: 0.28 | comms: 17.99 | reduce_grads: 0.18 | step: 348.77 | _step_clipping: 0.11 | _step_step: 347.07 | _step_zero_grad: 0.46 | _step_check_overflow: 0.59 samples/sec: 16.337 | iteration 6930/ 143000 | elapsed time per iteration (ms): 62679.6 | learning rate: 5.978E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.666194E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 00:12:24,862] [INFO] [logging.py:60:log_dist] [Rank 0] step=6940, skipped=0, lr=[0.0005978047107667359, 0.0005978047107667359], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6940 loss: 2.6576 iter time (s): 62.368 samples/sec: 16.419 %comms: 0.002885391971332292 %optimizer_step 0.05608533924196857 %forward: 23.286711281307166 %backward: 62.557833431904655 [2025-03-30 00:12:24,862] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18813.16 | forward: 145234.43 | backward_microstep: 390168.11 | backward: 390160.33 | backward_inner_microstep: 390145.06 | backward_inner: 390139.05 | backward_allreduce_microstep: 7.24 | backward_allreduce: 2.49 | reduce_tied_grads: 0.28 | comms: 18.00 | reduce_grads: 0.18 | step: 349.79 | _step_clipping: 0.11 | _step_step: 348.12 | _step_zero_grad: 0.47 | _step_check_overflow: 0.54 samples/sec: 16.419 | iteration 6940/ 143000 | elapsed time per iteration (ms): 62368.5 | learning rate: 5.978E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.671263E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 00:22:50,839] [INFO] [logging.py:60:log_dist] [Rank 0] step=6950, skipped=0, lr=[0.0005977967449306649, 0.0005977967449306649], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6950 loss: 2.6512 iter time (s): 62.597 samples/sec: 16.359 %comms: 0.002848656815005986 %optimizer_step 0.05536034985003665 %forward: 23.19657930827664 %backward: 62.310723595128614 [2025-03-30 00:22:50,840] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21302.39 | forward: 145204.16 | backward_microstep: 390054.16 | backward: 390047.86 | backward_inner_microstep: 390033.24 | backward_inner: 390027.71 | backward_allreduce_microstep: 7.06 | backward_allreduce: 2.42 | reduce_tied_grads: 0.25 | comms: 17.83 | reduce_grads: 0.19 | step: 346.54 | _step_clipping: 0.12 | _step_step: 344.92 | _step_zero_grad: 0.46 | _step_check_overflow: 0.52 samples/sec: 16.358 | iteration 6950/ 143000 | elapsed time per iteration (ms): 62597.7 | learning rate: 5.978E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.661668E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 00:33:14,510] [INFO] [logging.py:60:log_dist] [Rank 0] step=6960, skipped=0, lr=[0.0005977887647215877, 0.0005977887647215877], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6960 loss: 2.6510 iter time (s): 62.367 samples/sec: 16.419 %comms: 0.002927431522407164 %optimizer_step 0.057363008639559294 %forward: 23.28963331945101 %backward: 62.56085339930802 [2025-03-30 00:33:14,511] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18762.50 | forward: 145249.40 | backward_microstep: 390177.96 | backward: 390170.44 | backward_inner_microstep: 390154.47 | backward_inner: 390148.43 | backward_allreduce_microstep: 7.86 | backward_allreduce: 2.56 | reduce_tied_grads: 0.32 | comms: 18.26 | reduce_grads: 0.19 | step: 357.75 | _step_clipping: 0.32 | _step_step: 355.66 | _step_zero_grad: 0.49 | _step_check_overflow: 0.71 samples/sec: 16.419 | iteration 6960/ 143000 | elapsed time per iteration (ms): 62367.1 | learning rate: 5.978E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.651900E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 00:43:45,438] [INFO] [logging.py:60:log_dist] [Rank 0] step=6970, skipped=0, lr=[0.0005977807701398894, 0.0005977807701398894], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6970 loss: 2.6615 iter time (s): 63.092 samples/sec: 16.230 %comms: 0.00286220683421281 %optimizer_step 0.05671437410063245 %forward: 23.0276878266032 %backward: 61.8313871354089 [2025-03-30 00:43:45,439] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26086.70 | forward: 145286.81 | backward_microstep: 390115.99 | backward: 390107.98 | backward_inner_microstep: 390092.90 | backward_inner: 390087.10 | backward_allreduce_microstep: 7.19 | backward_allreduce: 2.47 | reduce_tied_grads: 0.29 | comms: 18.06 | reduce_grads: 0.18 | step: 357.82 | _step_clipping: 0.12 | _step_step: 356.08 | _step_zero_grad: 0.49 | _step_check_overflow: 0.57 samples/sec: 16.230 | iteration 6970/ 143000 | elapsed time per iteration (ms): 63092.8 | learning rate: 5.978E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.669231E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 00:54:10,968] [INFO] [logging.py:60:log_dist] [Rank 0] step=6980, skipped=0, lr=[0.0005977727611859557, 0.0005977727611859557], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6980 loss: 2.6349 iter time (s): 62.552 samples/sec: 16.370 %comms: 0.0028483323424633197 %optimizer_step 0.05505365607891667 %forward: 23.20114731512824 %backward: 62.34645851635796 [2025-03-30 00:54:10,968] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20985.33 | forward: 145128.89 | backward_microstep: 389998.61 | backward: 389992.46 | backward_inner_microstep: 389977.53 | backward_inner: 389971.97 | backward_allreduce_microstep: 7.29 | backward_allreduce: 2.50 | reduce_tied_grads: 0.25 | comms: 17.82 | reduce_grads: 0.18 | step: 344.37 | _step_clipping: 0.12 | _step_step: 342.73 | _step_zero_grad: 0.46 | _step_check_overflow: 0.53 samples/sec: 16.370 | iteration 6980/ 143000 | elapsed time per iteration (ms): 62553.0 | learning rate: 5.978E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.657921E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 01:04:32,549] [INFO] [logging.py:60:log_dist] [Rank 0] step=6990, skipped=0, lr=[0.0005977647378601733, 0.0005977647378601733], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 6990 loss: 2.6970 iter time (s): 62.158 samples/sec: 16.474 %comms: 0.0028961542296896243 %optimizer_step 0.05590607551931757 %forward: 23.36393008306179 %backward: 62.752943371075375 [2025-03-30 01:04:32,550] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16868.92 | forward: 145224.54 | backward_microstep: 390063.96 | backward: 390057.13 | backward_inner_microstep: 390041.91 | backward_inner: 390035.83 | backward_allreduce_microstep: 7.25 | backward_allreduce: 2.49 | reduce_tied_grads: 0.28 | comms: 18.00 | reduce_grads: 0.19 | step: 347.50 | _step_clipping: 0.13 | _step_step: 345.79 | _step_zero_grad: 0.47 | _step_check_overflow: 0.56 samples/sec: 16.474 | iteration 6990/ 143000 | elapsed time per iteration (ms): 62158.1 | learning rate: 5.978E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.674737E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 01:14:54,450] [INFO] [logging.py:60:log_dist] [Rank 0] step=7000, skipped=0, lr=[0.0005977567001629295, 0.0005977567001629295], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7000 loss: 2.6446 iter time (s): 62.190 samples/sec: 16.466 %comms: 0.0028620761007092685 %optimizer_step 0.056016374790953616 %forward: 23.337031539731292 %backward: 62.719450178615475 [2025-03-30 01:14:54,451] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17298.11 | forward: 145132.08 | backward_microstep: 390056.05 | backward: 390049.80 | backward_inner_microstep: 390033.35 | backward_inner: 390027.97 | backward_allreduce_microstep: 8.90 | backward_allreduce: 2.44 | reduce_tied_grads: 0.23 | comms: 17.80 | reduce_grads: 0.17 | step: 348.36 | _step_clipping: 0.12 | _step_step: 346.75 | _step_zero_grad: 0.46 | _step_check_overflow: 0.52 samples/sec: 16.466 | iteration 7000/ 143000 | elapsed time per iteration (ms): 62190.1 | learning rate: 5.978E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.666360E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 01:14:57,262] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: ../checkpoints/mydata_left-pythia160m/global_step7000/mp_rank_00_model_states.pt [2025-03-30 01:15:10,894] [INFO] [engine.py:1805:_copy_recovery_script] creating recovery script ../checkpoints/mydata_left-pythia160m/zero_to_fp32.py [2025-03-30 01:15:10,906] [INFO] [engine.py:1818:_save_zero_checkpoint] zero checkpoint saved ../checkpoints/mydata_left-pythia160m/global_step7000/zero_pp_rank_0_mp_rank_00_optim_states.pt [2025-03-30 01:25:38,375] [INFO] [logging.py:60:log_dist] [Rank 0] step=7010, skipped=0, lr=[0.0005977486480946122, 0.0005977486480946122], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7010 loss: 2.6671 iter time (s): 62.746 samples/sec: 16.320 %comms: 0.002874486551114174 %optimizer_step 0.05585191698888352 %forward: 23.1807744943395 %backward: 62.180962647455274 [2025-03-30 01:25:38,376] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22378.90 | forward: 145449.07 | backward_microstep: 390166.03 | backward: 390157.94 | backward_inner_microstep: 390142.64 | backward_inner: 390136.74 | backward_allreduce_microstep: 7.26 | backward_allreduce: 2.50 | reduce_tied_grads: 0.29 | comms: 18.04 | reduce_grads: 0.19 | step: 350.45 | _step_clipping: 0.12 | _step_step: 348.81 | _step_zero_grad: 0.47 | _step_check_overflow: 0.49 samples/sec: 15.902 | iteration 7010/ 143000 | elapsed time per iteration (ms): 64392.5 | learning rate: 5.977E-04 | approx flops per GPU: 68.6TFLOPS | lm_loss: 2.675423E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 01:35:55,579] [INFO] [logging.py:60:log_dist] [Rank 0] step=7020, skipped=0, lr=[0.00059774058165561, 0.00059774058165561], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7020 loss: 2.6505 iter time (s): 61.720 samples/sec: 16.591 %comms: 0.00288714251721684 %optimizer_step 0.056301789980344644 %forward: 23.524556594349956 %backward: 63.20002684911202 [2025-03-30 01:35:55,580] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 12500.80 | forward: 145193.25 | backward_microstep: 390076.13 | backward: 390069.71 | backward_inner_microstep: 390055.03 | backward_inner: 390049.71 | backward_allreduce_microstep: 7.21 | backward_allreduce: 2.41 | reduce_tied_grads: 0.24 | comms: 17.82 | reduce_grads: 0.19 | step: 347.49 | _step_clipping: 0.11 | _step_step: 345.87 | _step_zero_grad: 0.45 | _step_check_overflow: 0.54 samples/sec: 16.591 | iteration 7020/ 143000 | elapsed time per iteration (ms): 61720.4 | learning rate: 5.977E-04 | approx flops per GPU: 71.6TFLOPS | lm_loss: 2.666750E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 01:46:18,725] [INFO] [logging.py:60:log_dist] [Rank 0] step=7030, skipped=0, lr=[0.0005977325008463122, 0.0005977325008463122], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7030 loss: 2.6586 iter time (s): 62.314 samples/sec: 16.433 %comms: 0.002870629521696561 %optimizer_step 0.056166551733271564 %forward: 23.30771280925837 %backward: 62.61025425679669 [2025-03-30 01:46:18,726] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18277.21 | forward: 145239.91 | backward_microstep: 390157.27 | backward: 390150.16 | backward_inner_microstep: 390135.26 | backward_inner: 390129.67 | backward_allreduce_microstep: 7.15 | backward_allreduce: 2.45 | reduce_tied_grads: 0.28 | comms: 17.89 | reduce_grads: 0.18 | step: 350.00 | _step_clipping: 0.13 | _step_step: 348.34 | _step_zero_grad: 0.45 | _step_check_overflow: 0.54 samples/sec: 16.433 | iteration 7030/ 143000 | elapsed time per iteration (ms): 62314.6 | learning rate: 5.977E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.661590E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 01:56:40,791] [INFO] [logging.py:60:log_dist] [Rank 0] step=7040, skipped=0, lr=[0.0005977244056671088, 0.0005977244056671088], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7040 loss: 2.6515 iter time (s): 62.206 samples/sec: 16.461 %comms: 0.0028654616475282404 %optimizer_step 0.05609855116248619 %forward: 23.343338760292 %backward: 62.713562721869934 [2025-03-30 01:56:40,791] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17290.45 | forward: 145209.55 | backward_microstep: 390124.90 | backward: 390115.93 | backward_inner_microstep: 390100.95 | backward_inner: 390092.95 | backward_allreduce_microstep: 7.19 | backward_allreduce: 2.47 | reduce_tied_grads: 0.25 | comms: 17.82 | reduce_grads: 0.17 | step: 348.97 | _step_clipping: 0.12 | _step_step: 347.33 | _step_zero_grad: 0.46 | _step_check_overflow: 0.54 samples/sec: 16.461 | iteration 7040/ 143000 | elapsed time per iteration (ms): 62206.5 | learning rate: 5.977E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.660688E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 02:07:02,674] [INFO] [logging.py:60:log_dist] [Rank 0] step=7050, skipped=0, lr=[0.0005977162961183906, 0.0005977162961183906], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7050 loss: 2.6425 iter time (s): 62.188 samples/sec: 16.466 %comms: 0.0028784908633793555 %optimizer_step 0.05578466296661173 %forward: 23.354060255400437 %backward: 62.715743318642716 [2025-03-30 02:07:02,675] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17187.56 | forward: 145233.79 | backward_microstep: 390022.34 | backward: 390015.50 | backward_inner_microstep: 390000.64 | backward_inner: 389995.02 | backward_allreduce_microstep: 7.12 | backward_allreduce: 2.46 | reduce_tied_grads: 0.27 | comms: 17.90 | reduce_grads: 0.18 | step: 346.91 | _step_clipping: 0.11 | _step_step: 345.09 | _step_zero_grad: 0.46 | _step_check_overflow: 0.54 samples/sec: 16.466 | iteration 7050/ 143000 | elapsed time per iteration (ms): 62188.3 | learning rate: 5.977E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.652934E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 02:17:23,914] [INFO] [logging.py:60:log_dist] [Rank 0] step=7060, skipped=0, lr=[0.0005977081722005489, 0.0005977081722005489], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7060 loss: 2.6409 iter time (s): 62.124 samples/sec: 16.483 %comms: 0.002878438395444394 %optimizer_step 0.05540159186355109 %forward: 23.3862745630728 %backward: 62.78386277756135 [2025-03-30 02:17:23,915] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16457.64 | forward: 145283.75 | backward_microstep: 390042.08 | backward: 390035.41 | backward_inner_microstep: 390018.44 | backward_inner: 390012.62 | backward_allreduce_microstep: 7.31 | backward_allreduce: 2.51 | reduce_tied_grads: 0.28 | comms: 17.88 | reduce_grads: 0.18 | step: 344.17 | _step_clipping: 0.12 | _step_step: 342.57 | _step_zero_grad: 0.46 | _step_check_overflow: 0.49 samples/sec: 16.483 | iteration 7060/ 143000 | elapsed time per iteration (ms): 62124.0 | learning rate: 5.977E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.656720E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 02:27:46,210] [INFO] [logging.py:60:log_dist] [Rank 0] step=7070, skipped=0, lr=[0.0005977000339139758, 0.0005977000339139758], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7070 loss: 2.6651 iter time (s): 62.229 samples/sec: 16.455 %comms: 0.0028810662883113013 %optimizer_step 0.05618251671025056 %forward: 23.337658859855544 %backward: 62.6832774363831 [2025-03-30 02:27:46,211] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17508.31 | forward: 145228.02 | backward_microstep: 390079.27 | backward: 390072.05 | backward_inner_microstep: 390056.99 | backward_inner: 390049.57 | backward_allreduce_microstep: 7.19 | backward_allreduce: 2.47 | reduce_tied_grads: 0.29 | comms: 17.93 | reduce_grads: 0.18 | step: 349.62 | _step_clipping: 0.13 | _step_step: 347.94 | _step_zero_grad: 0.46 | _step_check_overflow: 0.54 samples/sec: 16.455 | iteration 7070/ 143000 | elapsed time per iteration (ms): 62229.6 | learning rate: 5.977E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.657641E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 02:38:15,019] [INFO] [logging.py:60:log_dist] [Rank 0] step=7080, skipped=0, lr=[0.0005976918812590642, 0.0005976918812590642], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7080 loss: 2.6484 iter time (s): 62.880 samples/sec: 16.285 %comms: 0.002858811741159765 %optimizer_step 0.05578015736345495 %forward: 23.10802251794532 %backward: 62.042174767516215 [2025-03-30 02:38:15,019] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23896.73 | forward: 145303.86 | backward_microstep: 390130.66 | backward: 390122.84 | backward_inner_microstep: 390107.30 | backward_inner: 390101.31 | backward_allreduce_microstep: 7.56 | backward_allreduce: 2.66 | reduce_tied_grads: 0.28 | comms: 17.98 | reduce_grads: 0.18 | step: 350.75 | _step_clipping: 0.12 | _step_step: 349.02 | _step_zero_grad: 0.47 | _step_check_overflow: 0.57 samples/sec: 16.285 | iteration 7080/ 143000 | elapsed time per iteration (ms): 62880.8 | learning rate: 5.977E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.658399E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 02:48:42,055] [INFO] [logging.py:60:log_dist] [Rank 0] step=7090, skipped=0, lr=[0.0005976837142362076, 0.0005976837142362076], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7090 loss: 2.6459 iter time (s): 62.703 samples/sec: 16.331 %comms: 0.00286909862760833 %optimizer_step 0.057073640268832264 %forward: 23.163237990763083 %backward: 62.21520132363091 [2025-03-30 02:48:42,055] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22197.00 | forward: 145240.48 | backward_microstep: 390115.29 | backward: 390108.05 | backward_inner_microstep: 390092.86 | backward_inner: 390086.79 | backward_allreduce_microstep: 7.25 | backward_allreduce: 2.48 | reduce_tied_grads: 0.33 | comms: 17.99 | reduce_grads: 0.19 | step: 357.87 | _step_clipping: 0.16 | _step_step: 356.08 | _step_zero_grad: 0.50 | _step_check_overflow: 0.55 samples/sec: 16.331 | iteration 7090/ 143000 | elapsed time per iteration (ms): 62703.6 | learning rate: 5.977E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.660768E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 02:59:07,977] [INFO] [logging.py:60:log_dist] [Rank 0] step=7100, skipped=0, lr=[0.0005976755328458, 0.0005976755328458], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7100 loss: 2.6566 iter time (s): 62.592 samples/sec: 16.360 %comms: 0.0028656682168229555 %optimizer_step 0.05709864491203558 %forward: 23.204115692866036 %backward: 62.325259093600835 [2025-03-30 02:59:07,978] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21133.03 | forward: 145238.52 | backward_microstep: 390111.71 | backward: 390104.44 | backward_inner_microstep: 390089.00 | backward_inner: 390083.14 | backward_allreduce_microstep: 7.47 | backward_allreduce: 2.49 | reduce_tied_grads: 0.32 | comms: 17.94 | reduce_grads: 0.19 | step: 357.39 | _step_clipping: 0.14 | _step_step: 355.59 | _step_zero_grad: 0.48 | _step_check_overflow: 0.61 samples/sec: 16.360 | iteration 7100/ 143000 | elapsed time per iteration (ms): 62592.3 | learning rate: 5.977E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.662985E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 03:09:34,988] [INFO] [logging.py:60:log_dist] [Rank 0] step=7110, skipped=0, lr=[0.0005976673370882364, 0.0005976673370882364], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7110 loss: 2.6655 iter time (s): 62.700 samples/sec: 16.332 %comms: 0.0028649176945000146 %optimizer_step 0.05618300454347739 %forward: 23.16107021753298 %backward: 62.229819007449095 [2025-03-30 03:09:34,988] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22137.71 | forward: 145221.02 | backward_microstep: 390191.13 | backward: 390183.94 | backward_inner_microstep: 390168.79 | backward_inner: 390161.31 | backward_allreduce_microstep: 7.24 | backward_allreduce: 2.51 | reduce_tied_grads: 0.30 | comms: 17.96 | reduce_grads: 0.18 | step: 352.27 | _step_clipping: 0.15 | _step_step: 350.48 | _step_zero_grad: 0.47 | _step_check_overflow: 0.62 samples/sec: 16.331 | iteration 7110/ 143000 | elapsed time per iteration (ms): 62701.0 | learning rate: 5.977E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.658577E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 03:20:01,515] [INFO] [logging.py:60:log_dist] [Rank 0] step=7120, skipped=0, lr=[0.0005976591269639124, 0.0005976591269639124], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7120 loss: 2.6487 iter time (s): 62.652 samples/sec: 16.344 %comms: 0.002867582238405599 %optimizer_step 0.05611742945740653 %forward: 23.185347323984775 %backward: 62.26850748365062 [2025-03-30 03:20:01,516] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21676.90 | forward: 145261.29 | backward_microstep: 390132.86 | backward: 390125.86 | backward_inner_microstep: 390110.74 | backward_inner: 390104.72 | backward_allreduce_microstep: 7.23 | backward_allreduce: 2.49 | reduce_tied_grads: 0.28 | comms: 17.97 | reduce_grads: 0.18 | step: 351.59 | _step_clipping: 0.12 | _step_step: 349.90 | _step_zero_grad: 0.47 | _step_check_overflow: 0.55 samples/sec: 16.344 | iteration 7120/ 143000 | elapsed time per iteration (ms): 62652.7 | learning rate: 5.977E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.656884E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 03:30:23,219] [INFO] [logging.py:60:log_dist] [Rank 0] step=7130, skipped=0, lr=[0.000597650902473224, 0.000597650902473224], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7130 loss: 2.6421 iter time (s): 62.170 samples/sec: 16.471 %comms: 0.0028760246016927686 %optimizer_step 0.05611547952192499 %forward: 23.355995720737706 %backward: 62.73567236918628 [2025-03-30 03:30:23,220] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17020.76 | forward: 145203.87 | backward_microstep: 390033.23 | backward: 390026.72 | backward_inner_microstep: 390011.89 | backward_inner: 390004.63 | backward_allreduce_microstep: 7.13 | backward_allreduce: 2.44 | reduce_tied_grads: 0.27 | comms: 17.88 | reduce_grads: 0.18 | step: 348.87 | _step_clipping: 0.12 | _step_step: 347.18 | _step_zero_grad: 0.45 | _step_check_overflow: 0.58 samples/sec: 16.471 | iteration 7130/ 143000 | elapsed time per iteration (ms): 62170.4 | learning rate: 5.977E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.653721E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 03:40:45,092] [INFO] [logging.py:60:log_dist] [Rank 0] step=7140, skipped=0, lr=[0.0005976426636165685, 0.0005976426636165685], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7140 loss: 2.6560 iter time (s): 62.187 samples/sec: 16.467 %comms: 0.0028760105996067462 %optimizer_step 0.05625448361867072 %forward: 23.354468461675047 %backward: 62.74044044786342 [2025-03-30 03:40:45,092] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17017.66 | forward: 145233.80 | backward_microstep: 390171.01 | backward: 390162.29 | backward_inner_microstep: 390147.55 | backward_inner: 390141.94 | backward_allreduce_microstep: 7.09 | backward_allreduce: 2.44 | reduce_tied_grads: 0.26 | comms: 17.88 | reduce_grads: 0.18 | step: 349.83 | _step_clipping: 0.12 | _step_step: 348.18 | _step_zero_grad: 0.46 | _step_check_overflow: 0.53 samples/sec: 16.466 | iteration 7140/ 143000 | elapsed time per iteration (ms): 62187.3 | learning rate: 5.976E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.644637E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 03:51:08,088] [INFO] [logging.py:60:log_dist] [Rank 0] step=7150, skipped=0, lr=[0.0005976344103943434, 0.0005976344103943434], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7150 loss: 2.6532 iter time (s): 62.299 samples/sec: 16.437 %comms: 0.0028923700276369826 %optimizer_step 0.056376033879202925 %forward: 23.326326273729403 %backward: 62.63771139313038 [2025-03-30 03:51:08,088] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17982.91 | forward: 145320.87 | backward_microstep: 390235.44 | backward: 390227.18 | backward_inner_microstep: 390207.98 | backward_inner: 390201.98 | backward_allreduce_microstep: 7.53 | backward_allreduce: 2.53 | reduce_tied_grads: 0.30 | comms: 18.02 | reduce_grads: 0.18 | step: 351.22 | _step_clipping: 0.11 | _step_step: 349.59 | _step_zero_grad: 0.47 | _step_check_overflow: 0.50 samples/sec: 16.437 | iteration 7150/ 143000 | elapsed time per iteration (ms): 62299.6 | learning rate: 5.976E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.643949E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 04:01:30,018] [INFO] [logging.py:60:log_dist] [Rank 0] step=7160, skipped=0, lr=[0.0005976261428069468, 0.0005976261428069468], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7160 loss: 2.6256 iter time (s): 62.193 samples/sec: 16.465 %comms: 0.0028709124305602825 %optimizer_step 0.05640266271594195 %forward: 23.3377485072691 %backward: 62.709708395533404 [2025-03-30 04:01:30,019] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17320.75 | forward: 145143.35 | backward_microstep: 390013.64 | backward: 390007.50 | backward_inner_microstep: 389992.94 | backward_inner: 389987.50 | backward_allreduce_microstep: 7.07 | backward_allreduce: 2.43 | reduce_tied_grads: 0.26 | comms: 17.85 | reduce_grads: 0.17 | step: 350.78 | _step_clipping: 0.12 | _step_step: 349.13 | _step_zero_grad: 0.46 | _step_check_overflow: 0.54 samples/sec: 16.465 | iteration 7160/ 143000 | elapsed time per iteration (ms): 62193.1 | learning rate: 5.976E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.635629E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 04:11:56,079] [INFO] [logging.py:60:log_dist] [Rank 0] step=7170, skipped=0, lr=[0.0005976178608547781, 0.0005976178608547781], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7170 loss: 2.6550 iter time (s): 62.606 samples/sec: 16.356 %comms: 0.0028619873478444497 %optimizer_step 0.05931772677958343 %forward: 23.196321654067763 %backward: 62.304052038985226 [2025-03-30 04:11:56,080] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21282.10 | forward: 145221.88 | backward_microstep: 390065.20 | backward: 390058.03 | backward_inner_microstep: 390043.12 | backward_inner: 390037.29 | backward_allreduce_microstep: 7.13 | backward_allreduce: 2.45 | reduce_tied_grads: 0.27 | comms: 17.92 | reduce_grads: 0.18 | step: 371.36 | _step_clipping: 0.11 | _step_step: 369.70 | _step_zero_grad: 0.46 | _step_check_overflow: 0.53 samples/sec: 16.356 | iteration 7170/ 143000 | elapsed time per iteration (ms): 62606.1 | learning rate: 5.976E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.642912E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 04:22:23,942] [INFO] [logging.py:60:log_dist] [Rank 0] step=7180, skipped=0, lr=[0.0005976095645382369, 0.0005976095645382369], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7180 loss: 2.6527 iter time (s): 62.786 samples/sec: 16.309 %comms: 0.002847318664383914 %optimizer_step 0.055567522523195544 %forward: 23.132683007552856 %backward: 62.13474310271837 [2025-03-30 04:22:23,942] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23030.95 | forward: 145240.27 | backward_microstep: 390125.21 | backward: 390117.60 | backward_inner_microstep: 390102.25 | backward_inner: 390096.32 | backward_allreduce_microstep: 7.35 | backward_allreduce: 2.51 | reduce_tied_grads: 0.28 | comms: 17.88 | reduce_grads: 0.18 | step: 348.88 | _step_clipping: 0.14 | _step_step: 347.27 | _step_zero_grad: 0.46 | _step_check_overflow: 0.46 samples/sec: 16.309 | iteration 7180/ 143000 | elapsed time per iteration (ms): 62786.3 | learning rate: 5.976E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.652653E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 04:32:46,671] [INFO] [logging.py:60:log_dist] [Rank 0] step=7190, skipped=0, lr=[0.0005976012538577236, 0.0005976012538577236], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7190 loss: 2.6416 iter time (s): 62.272 samples/sec: 16.444 %comms: 0.0028917324087838953 %optimizer_step 0.056349021559966836 %forward: 23.32911233207849 %backward: 62.654713977194554 [2025-03-30 04:32:46,672] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17794.18 | forward: 145276.02 | backward_microstep: 390173.50 | backward: 390166.05 | backward_inner_microstep: 390150.72 | backward_inner: 390144.78 | backward_allreduce_microstep: 7.33 | backward_allreduce: 2.52 | reduce_tied_grads: 0.29 | comms: 18.01 | reduce_grads: 0.18 | step: 350.90 | _step_clipping: 0.13 | _step_step: 349.17 | _step_zero_grad: 0.48 | _step_check_overflow: 0.57 samples/sec: 16.444 | iteration 7190/ 143000 | elapsed time per iteration (ms): 62273.0 | learning rate: 5.976E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.649541E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 04:43:13,415] [INFO] [logging.py:60:log_dist] [Rank 0] step=7200, skipped=0, lr=[0.0005975929288136393, 0.0005975929288136393], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7200 loss: 2.6449 iter time (s): 62.674 samples/sec: 16.339 %comms: 0.0028506515242119027 %optimizer_step 0.055360675146274074 %forward: 23.17958255676259 %backward: 62.246785826497316 [2025-03-30 04:43:13,416] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21856.73 | forward: 145275.40 | backward_microstep: 390131.58 | backward: 390124.66 | backward_inner_microstep: 390109.65 | backward_inner: 390103.99 | backward_allreduce_microstep: 7.23 | backward_allreduce: 2.48 | reduce_tied_grads: 0.27 | comms: 17.87 | reduce_grads: 0.18 | step: 346.97 | _step_clipping: 0.12 | _step_step: 345.30 | _step_zero_grad: 0.45 | _step_check_overflow: 0.55 samples/sec: 16.338 | iteration 7200/ 143000 | elapsed time per iteration (ms): 62674.4 | learning rate: 5.976E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.634272E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 04:53:35,499] [INFO] [logging.py:60:log_dist] [Rank 0] step=7210, skipped=0, lr=[0.0005975845894063856, 0.0005975845894063856], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7210 loss: 2.6366 iter time (s): 62.208 samples/sec: 16.461 %comms: 0.0028863407277827393 %optimizer_step 0.056159927449374855 %forward: 23.33549261335296 %backward: 62.694906476663945 [2025-03-30 04:53:35,500] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17439.89 | forward: 145165.07 | backward_microstep: 390018.22 | backward: 390011.49 | backward_inner_microstep: 389996.25 | backward_inner: 389990.55 | backward_allreduce_microstep: 7.46 | backward_allreduce: 2.48 | reduce_tied_grads: 0.29 | comms: 17.96 | reduce_grads: 0.18 | step: 349.36 | _step_clipping: 0.12 | _step_step: 347.51 | _step_zero_grad: 0.46 | _step_check_overflow: 0.53 samples/sec: 16.461 | iteration 7210/ 143000 | elapsed time per iteration (ms): 62208.4 | learning rate: 5.976E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.651143E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 05:04:01,617] [INFO] [logging.py:60:log_dist] [Rank 0] step=7220, skipped=0, lr=[0.0005975762356363655, 0.0005975762356363655], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7220 loss: 2.6560 iter time (s): 62.611 samples/sec: 16.355 %comms: 0.0028724276050037306 %optimizer_step 0.05574770696061182 %forward: 23.187225648473582 %backward: 62.296532049644206 [2025-03-30 05:04:01,617] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21434.82 | forward: 145178.12 | backward_microstep: 390052.65 | backward: 390046.38 | backward_inner_microstep: 390031.58 | backward_inner: 390024.35 | backward_allreduce_microstep: 7.21 | backward_allreduce: 2.47 | reduce_tied_grads: 0.24 | comms: 17.98 | reduce_grads: 0.17 | step: 349.04 | _step_clipping: 0.11 | _step_step: 347.41 | _step_zero_grad: 0.46 | _step_check_overflow: 0.53 samples/sec: 16.355 | iteration 7220/ 143000 | elapsed time per iteration (ms): 62611.8 | learning rate: 5.976E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.646499E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 05:14:28,915] [INFO] [logging.py:60:log_dist] [Rank 0] step=7230, skipped=0, lr=[0.0005975678675039818, 0.0005975678675039818], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7230 loss: 2.6417 iter time (s): 62.729 samples/sec: 16.324 %comms: 0.0028415512159823643 %optimizer_step 0.055622789232238394 %forward: 23.153231292796637 %backward: 62.18471072327139 [2025-03-30 05:14:28,915] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22503.03 | forward: 145236.95 | backward_microstep: 390082.58 | backward: 390075.92 | backward_inner_microstep: 390057.64 | backward_inner: 390052.06 | backward_allreduce_microstep: 8.82 | backward_allreduce: 2.45 | reduce_tied_grads: 0.25 | comms: 17.82 | reduce_grads: 0.18 | step: 348.91 | _step_clipping: 0.12 | _step_step: 347.08 | _step_zero_grad: 0.48 | _step_check_overflow: 0.69 samples/sec: 16.324 | iteration 7230/ 143000 | elapsed time per iteration (ms): 62729.8 | learning rate: 5.976E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.646836E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 05:24:54,463] [INFO] [logging.py:60:log_dist] [Rank 0] step=7240, skipped=0, lr=[0.0005975594850096384, 0.0005975594850096384], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7240 loss: 2.6380 iter time (s): 62.554 samples/sec: 16.370 %comms: 0.0028526291991562707 %optimizer_step 0.0552074937361997 %forward: 23.222729216170244 %backward: 62.367682021862294 [2025-03-30 05:24:54,464] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20696.90 | forward: 145268.29 | backward_microstep: 390145.01 | backward: 390137.03 | backward_inner_microstep: 390122.20 | backward_inner: 390116.54 | backward_allreduce_microstep: 7.16 | backward_allreduce: 2.47 | reduce_tied_grads: 0.25 | comms: 17.84 | reduce_grads: 0.18 | step: 345.35 | _step_clipping: 0.12 | _step_step: 343.73 | _step_zero_grad: 0.46 | _step_check_overflow: 0.51 samples/sec: 16.370 | iteration 7240/ 143000 | elapsed time per iteration (ms): 62554.9 | learning rate: 5.976E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.639671E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 05:35:16,692] [INFO] [logging.py:60:log_dist] [Rank 0] step=7250, skipped=0, lr=[0.00059755108815374, 0.00059755108815374], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7250 loss: 2.6483 iter time (s): 62.222 samples/sec: 16.457 %comms: 0.002875667877830457 %optimizer_step 0.05641671916484499 %forward: 23.335626979479475 %backward: 62.703498922612745 [2025-03-30 05:35:16,692] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17401.70 | forward: 145199.71 | backward_microstep: 390162.10 | backward: 390155.79 | backward_inner_microstep: 390141.07 | backward_inner: 390135.55 | backward_allreduce_microstep: 7.12 | backward_allreduce: 2.45 | reduce_tied_grads: 0.28 | comms: 17.89 | reduce_grads: 0.18 | step: 351.04 | _step_clipping: 0.12 | _step_step: 349.33 | _step_zero_grad: 0.45 | _step_check_overflow: 0.59 samples/sec: 16.457 | iteration 7250/ 143000 | elapsed time per iteration (ms): 62222.9 | learning rate: 5.976E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.646635E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 05:45:41,546] [INFO] [logging.py:60:log_dist] [Rank 0] step=7260, skipped=0, lr=[0.0005975426769366919, 0.0005975426769366919], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7260 loss: 2.6552 iter time (s): 62.485 samples/sec: 16.388 %comms: 0.00287205742014111 %optimizer_step 0.05549941612806086 %forward: 23.23517457283878 %backward: 62.43657833984777 [2025-03-30 05:45:41,546] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20086.71 | forward: 145184.62 | backward_microstep: 390140.42 | backward: 390133.96 | backward_inner_microstep: 390119.06 | backward_inner: 390113.40 | backward_allreduce_microstep: 7.18 | backward_allreduce: 2.49 | reduce_tied_grads: 0.27 | comms: 17.95 | reduce_grads: 0.18 | step: 346.79 | _step_clipping: 0.12 | _step_step: 345.11 | _step_zero_grad: 0.47 | _step_check_overflow: 0.54 samples/sec: 16.388 | iteration 7260/ 143000 | elapsed time per iteration (ms): 62485.4 | learning rate: 5.975E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.640834E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 05:56:08,183] [INFO] [logging.py:60:log_dist] [Rank 0] step=7270, skipped=0, lr=[0.0005975342513588999, 0.0005975342513588999], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7270 loss: 2.6409 iter time (s): 62.663 samples/sec: 16.341 %comms: 0.002857909704958102 %optimizer_step 0.05586594438247545 %forward: 23.175758109697266 %backward: 62.25994773624546 [2025-03-30 05:56:08,183] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21808.70 | forward: 145226.69 | backward_microstep: 390147.14 | backward: 390140.68 | backward_inner_microstep: 390125.52 | backward_inner: 390119.92 | backward_allreduce_microstep: 7.22 | backward_allreduce: 2.48 | reduce_tied_grads: 0.27 | comms: 17.91 | reduce_grads: 0.18 | step: 350.07 | _step_clipping: 0.12 | _step_step: 348.37 | _step_zero_grad: 0.48 | _step_check_overflow: 0.57 samples/sec: 16.341 | iteration 7270/ 143000 | elapsed time per iteration (ms): 62663.7 | learning rate: 5.975E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.648583E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 06:06:33,121] [INFO] [logging.py:60:log_dist] [Rank 0] step=7280, skipped=0, lr=[0.0005975258114207708, 0.0005975258114207708], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7280 loss: 2.6528 iter time (s): 62.493 samples/sec: 16.386 %comms: 0.002858966973406034 %optimizer_step 0.05649834237084963 %forward: 23.245677206779973 %backward: 62.433001679086665 [2025-03-30 06:06:33,121] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20020.63 | forward: 145269.77 | backward_microstep: 390170.80 | backward: 390164.07 | backward_inner_microstep: 390148.81 | backward_inner: 390143.12 | backward_allreduce_microstep: 7.24 | backward_allreduce: 2.47 | reduce_tied_grads: 0.28 | comms: 17.87 | reduce_grads: 0.18 | step: 353.08 | _step_clipping: 0.11 | _step_step: 351.44 | _step_zero_grad: 0.46 | _step_check_overflow: 0.53 samples/sec: 16.386 | iteration 7280/ 143000 | elapsed time per iteration (ms): 62493.8 | learning rate: 5.975E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.650277E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 06:17:00,800] [INFO] [logging.py:60:log_dist] [Rank 0] step=7290, skipped=0, lr=[0.0005975173571227118, 0.0005975173571227118], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7290 loss: 2.6405 iter time (s): 62.767 samples/sec: 16.314 %comms: 0.002845036762849042 %optimizer_step 0.055841158481284264 %forward: 23.140731848953468 %backward: 62.15472956239248 [2025-03-30 06:17:00,800] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22844.01 | forward: 145248.32 | backward_microstep: 390135.26 | backward: 390128.98 | backward_inner_microstep: 390113.98 | backward_inner: 390108.50 | backward_allreduce_microstep: 7.38 | backward_allreduce: 2.45 | reduce_tied_grads: 0.26 | comms: 17.86 | reduce_grads: 0.17 | step: 350.50 | _step_clipping: 0.13 | _step_step: 348.85 | _step_zero_grad: 0.46 | _step_check_overflow: 0.53 samples/sec: 16.314 | iteration 7290/ 143000 | elapsed time per iteration (ms): 62767.9 | learning rate: 5.975E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.652797E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 06:27:18,873] [INFO] [logging.py:60:log_dist] [Rank 0] step=7300, skipped=0, lr=[0.0005975088884651311, 0.0005975088884651311], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7300 loss: 2.6348 iter time (s): 61.807 samples/sec: 16.568 %comms: 0.0029161016354894457 %optimizer_step 0.05826444051255909 %forward: 23.492556302823463 %backward: 63.13325724680159 [2025-03-30 06:27:18,873] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13169.13 | forward: 145199.97 | backward_microstep: 390213.54 | backward: 390206.46 | backward_inner_microstep: 390191.26 | backward_inner: 390183.82 | backward_allreduce_microstep: 7.26 | backward_allreduce: 2.48 | reduce_tied_grads: 0.26 | comms: 18.02 | reduce_grads: 0.18 | step: 360.11 | _step_clipping: 0.12 | _step_step: 358.48 | _step_zero_grad: 0.46 | _step_check_overflow: 0.52 samples/sec: 16.568 | iteration 7300/ 143000 | elapsed time per iteration (ms): 61807.3 | learning rate: 5.975E-04 | approx flops per GPU: 71.5TFLOPS | lm_loss: 2.629783E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 06:37:41,473] [INFO] [logging.py:60:log_dist] [Rank 0] step=7310, skipped=0, lr=[0.0005975004054484374, 0.0005975004054484374], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7310 loss: 2.6466 iter time (s): 62.260 samples/sec: 16.447 %comms: 0.002884634122318039 %optimizer_step 0.05619296220777298 %forward: 23.315036491127135 %backward: 62.637291857265765 [2025-03-30 06:37:41,474] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18012.25 | forward: 145158.31 | backward_microstep: 389982.85 | backward: 389976.80 | backward_inner_microstep: 389962.08 | backward_inner: 389956.66 | backward_allreduce_microstep: 7.16 | backward_allreduce: 2.47 | reduce_tied_grads: 0.25 | comms: 17.96 | reduce_grads: 0.18 | step: 349.85 | _step_clipping: 0.12 | _step_step: 348.16 | _step_zero_grad: 0.46 | _step_check_overflow: 0.57 samples/sec: 16.447 | iteration 7310/ 143000 | elapsed time per iteration (ms): 62260.0 | learning rate: 5.975E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.638054E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 06:48:08,219] [INFO] [logging.py:60:log_dist] [Rank 0] step=7320, skipped=0, lr=[0.0005974919080730401, 0.0005974919080730401], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7320 loss: 2.6412 iter time (s): 62.674 samples/sec: 16.339 %comms: 0.002850833319737784 %optimizer_step 0.05498911718793025 %forward: 23.176026306566694 %backward: 62.22782102928031 [2025-03-30 06:48:08,220] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22011.32 | forward: 145253.54 | backward_microstep: 390017.25 | backward: 390006.95 | backward_inner_microstep: 389988.61 | backward_inner: 389983.06 | backward_allreduce_microstep: 7.21 | backward_allreduce: 2.48 | reduce_tied_grads: 0.26 | comms: 17.87 | reduce_grads: 0.18 | step: 344.64 | _step_clipping: 0.11 | _step_step: 343.01 | _step_zero_grad: 0.46 | _step_check_overflow: 0.53 samples/sec: 16.338 | iteration 7320/ 143000 | elapsed time per iteration (ms): 62674.6 | learning rate: 5.975E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.644423E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 06:58:34,522] [INFO] [logging.py:60:log_dist] [Rank 0] step=7330, skipped=0, lr=[0.0005974833963393493, 0.0005974833963393493], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7330 loss: 2.6224 iter time (s): 62.630 samples/sec: 16.350 %comms: 0.002865793787066117 %optimizer_step 0.055878620066840315 %forward: 23.17571124385294 %backward: 62.279864555217955 [2025-03-30 06:58:34,522] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21619.42 | forward: 145148.85 | backward_microstep: 390063.97 | backward: 390057.11 | backward_inner_microstep: 390041.90 | backward_inner: 390036.13 | backward_allreduce_microstep: 7.34 | backward_allreduce: 2.55 | reduce_tied_grads: 0.29 | comms: 17.95 | reduce_grads: 0.18 | step: 349.97 | _step_clipping: 0.14 | _step_step: 348.28 | _step_zero_grad: 0.48 | _step_check_overflow: 0.51 samples/sec: 16.350 | iteration 7330/ 143000 | elapsed time per iteration (ms): 62630.3 | learning rate: 5.975E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.642317E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 07:08:56,441] [INFO] [logging.py:60:log_dist] [Rank 0] step=7340, skipped=0, lr=[0.0005974748702477757, 0.0005974748702477757], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7340 loss: 2.6342 iter time (s): 62.191 samples/sec: 16.465 %comms: 0.0028783620836706405 %optimizer_step 0.055441841813515844 %forward: 23.336863932985885 %backward: 62.70346980250541 [2025-03-30 07:08:56,442] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17370.64 | forward: 145135.28 | backward_microstep: 389967.60 | backward: 389961.81 | backward_inner_microstep: 389945.36 | backward_inner: 389939.92 | backward_allreduce_microstep: 8.97 | backward_allreduce: 4.32 | reduce_tied_grads: 0.28 | comms: 17.90 | reduce_grads: 0.19 | step: 344.80 | _step_clipping: 0.14 | _step_step: 343.16 | _step_zero_grad: 0.45 | _step_check_overflow: 0.50 samples/sec: 16.465 | iteration 7340/ 143000 | elapsed time per iteration (ms): 62191.9 | learning rate: 5.975E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.635301E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 07:19:23,575] [INFO] [logging.py:60:log_dist] [Rank 0] step=7350, skipped=0, lr=[0.0005974663297987311, 0.0005974663297987311], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7350 loss: 2.6282 iter time (s): 62.713 samples/sec: 16.328 %comms: 0.0031161806995658584 %optimizer_step 0.05693714204239518 %forward: 23.16099006794245 %backward: 62.20267974280216 [2025-03-30 07:19:23,576] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22302.94 | forward: 145249.16 | backward_microstep: 390097.54 | backward: 390090.70 | backward_inner_microstep: 390075.75 | backward_inner: 390069.82 | backward_allreduce_microstep: 7.18 | backward_allreduce: 2.45 | reduce_tied_grads: 1.97 | comms: 19.54 | reduce_grads: 0.18 | step: 357.07 | _step_clipping: 0.12 | _step_step: 355.35 | _step_zero_grad: 0.48 | _step_check_overflow: 0.55 samples/sec: 16.328 | iteration 7350/ 143000 | elapsed time per iteration (ms): 62713.4 | learning rate: 5.975E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.643795E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 07:29:51,285] [INFO] [logging.py:60:log_dist] [Rank 0] step=7360, skipped=0, lr=[0.0005974577749926275, 0.0005974577749926275], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7360 loss: 2.6473 iter time (s): 62.770 samples/sec: 16.313 %comms: 0.0028937055301441732 %optimizer_step 0.055853245567360446 %forward: 23.130333006044733 %backward: 62.14133992729762 [2025-03-30 07:29:51,285] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23015.16 | forward: 145190.14 | backward_microstep: 390070.52 | backward: 390063.98 | backward_inner_microstep: 390048.84 | backward_inner: 390043.36 | backward_allreduce_microstep: 7.23 | backward_allreduce: 2.50 | reduce_tied_grads: 0.27 | comms: 18.16 | reduce_grads: 0.18 | step: 350.59 | _step_clipping: 0.12 | _step_step: 348.74 | _step_zero_grad: 0.46 | _step_check_overflow: 0.73 samples/sec: 16.313 | iteration 7360/ 143000 | elapsed time per iteration (ms): 62771.0 | learning rate: 5.975E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.646041E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 07:40:12,910] [INFO] [logging.py:60:log_dist] [Rank 0] step=7370, skipped=0, lr=[0.0005974492058298779, 0.0005974492058298779], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7370 loss: 2.6335 iter time (s): 62.162 samples/sec: 16.473 %comms: 0.002875046814485851 %optimizer_step 0.056115613238857706 %forward: 23.358222341801397 %backward: 62.757300883083936 [2025-03-30 07:40:12,910] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16830.12 | forward: 145199.31 | backward_microstep: 390118.88 | backward: 390111.73 | backward_inner_microstep: 390096.54 | backward_inner: 390089.06 | backward_allreduce_microstep: 7.26 | backward_allreduce: 2.49 | reduce_tied_grads: 0.28 | comms: 17.87 | reduce_grads: 0.18 | step: 348.83 | _step_clipping: 0.14 | _step_step: 347.17 | _step_zero_grad: 0.45 | _step_check_overflow: 0.54 samples/sec: 16.473 | iteration 7370/ 143000 | elapsed time per iteration (ms): 62162.5 | learning rate: 5.974E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.628769E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 07:50:40,617] [INFO] [logging.py:60:log_dist] [Rank 0] step=7380, skipped=0, lr=[0.0005974406223108958, 0.0005974406223108958], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7380 loss: 2.6419 iter time (s): 62.770 samples/sec: 16.313 %comms: 0.002849052097873621 %optimizer_step 0.05590008935640608 %forward: 23.12545475077508 %backward: 62.13599506569047 [2025-03-30 07:50:40,617] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23077.40 | forward: 145158.80 | backward_microstep: 390035.15 | backward: 390028.50 | backward_inner_microstep: 390013.72 | backward_inner: 390008.18 | backward_allreduce_microstep: 7.10 | backward_allreduce: 2.45 | reduce_tied_grads: 0.26 | comms: 17.88 | reduce_grads: 0.18 | step: 350.89 | _step_clipping: 0.10 | _step_step: 349.25 | _step_zero_grad: 0.46 | _step_check_overflow: 0.53 samples/sec: 16.313 | iteration 7380/ 143000 | elapsed time per iteration (ms): 62770.7 | learning rate: 5.974E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.639918E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 08:01:06,869] [INFO] [logging.py:60:log_dist] [Rank 0] step=7390, skipped=0, lr=[0.0005974320244360955, 0.0005974320244360955], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7390 loss: 2.6392 iter time (s): 62.625 samples/sec: 16.351 %comms: 0.0028546403127393807 %optimizer_step 0.055369141373092325 %forward: 23.183116199679993 %backward: 62.28307312933965 [2025-03-30 08:01:06,870] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21562.41 | forward: 145183.59 | backward_microstep: 390052.69 | backward: 390045.93 | backward_inner_microstep: 390027.39 | backward_inner: 390021.73 | backward_allreduce_microstep: 7.15 | backward_allreduce: 2.47 | reduce_tied_grads: 0.27 | comms: 17.88 | reduce_grads: 0.18 | step: 346.75 | _step_clipping: 0.13 | _step_step: 345.16 | _step_zero_grad: 0.45 | _step_check_overflow: 0.47 samples/sec: 16.351 | iteration 7390/ 143000 | elapsed time per iteration (ms): 62625.2 | learning rate: 5.974E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.634767E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 08:11:28,352] [INFO] [logging.py:60:log_dist] [Rank 0] step=7400, skipped=0, lr=[0.000597423412205892, 0.000597423412205892], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7400 loss: 2.6427 iter time (s): 62.148 samples/sec: 16.477 %comms: 0.002889359540461705 %optimizer_step 0.05605290756602346 %forward: 23.349706108581042 %backward: 62.75812869407111 [2025-03-30 08:11:28,353] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16908.44 | forward: 145113.29 | backward_microstep: 390034.62 | backward: 390027.96 | backward_inner_microstep: 390013.26 | backward_inner: 390007.70 | backward_allreduce_microstep: 7.07 | backward_allreduce: 2.42 | reduce_tied_grads: 0.27 | comms: 17.96 | reduce_grads: 0.18 | step: 348.36 | _step_clipping: 0.13 | _step_step: 346.69 | _step_zero_grad: 0.46 | _step_check_overflow: 0.54 samples/sec: 16.477 | iteration 7400/ 143000 | elapsed time per iteration (ms): 62148.4 | learning rate: 5.974E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.635754E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 08:21:53,679] [INFO] [logging.py:60:log_dist] [Rank 0] step=7410, skipped=0, lr=[0.0005974147856207009, 0.0005974147856207009], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7410 loss: 2.6353 iter time (s): 62.532 samples/sec: 16.376 %comms: 0.002860549064293797 %optimizer_step 0.05562121968609995 %forward: 23.212288317281764 %backward: 62.37178803940711 [2025-03-30 08:21:53,680] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20718.14 | forward: 145151.14 | backward_microstep: 390029.42 | backward: 390023.42 | backward_inner_microstep: 390008.68 | backward_inner: 390003.12 | backward_allreduce_microstep: 7.12 | backward_allreduce: 2.44 | reduce_tied_grads: 0.27 | comms: 17.89 | reduce_grads: 0.18 | step: 347.81 | _step_clipping: 0.12 | _step_step: 346.15 | _step_zero_grad: 0.45 | _step_check_overflow: 0.56 samples/sec: 16.375 | iteration 7410/ 143000 | elapsed time per iteration (ms): 62532.6 | learning rate: 5.974E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.639519E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 08:32:20,807] [INFO] [logging.py:60:log_dist] [Rank 0] step=7420, skipped=0, lr=[0.0005974061446809385, 0.0005974061446809385], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7420 loss: 2.6247 iter time (s): 62.712 samples/sec: 16.329 %comms: 0.002850729911860103 %optimizer_step 0.05553954396095299 %forward: 23.15046168594363 %backward: 62.20147374889561 [2025-03-30 08:32:20,808] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22415.78 | forward: 145181.84 | backward_microstep: 390086.40 | backward: 390079.66 | backward_inner_microstep: 390064.58 | backward_inner: 390058.83 | backward_allreduce_microstep: 7.29 | backward_allreduce: 2.52 | reduce_tied_grads: 0.27 | comms: 17.88 | reduce_grads: 0.18 | step: 348.30 | _step_clipping: 0.12 | _step_step: 346.68 | _step_zero_grad: 0.45 | _step_check_overflow: 0.52 samples/sec: 16.328 | iteration 7420/ 143000 | elapsed time per iteration (ms): 62712.8 | learning rate: 5.974E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.635685E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 08:42:48,096] [INFO] [logging.py:60:log_dist] [Rank 0] step=7430, skipped=0, lr=[0.0005973974893870221, 0.0005973974893870221], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7430 loss: 2.6253 iter time (s): 62.728 samples/sec: 16.324 %comms: 0.002841753074160943 %optimizer_step 0.05600389992496787 %forward: 23.14746023215393 %backward: 62.192654391790406 [2025-03-30 08:42:48,096] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22487.91 | forward: 145200.15 | backward_microstep: 390130.61 | backward: 390124.12 | backward_inner_microstep: 390107.74 | backward_inner: 390102.24 | backward_allreduce_microstep: 8.80 | backward_allreduce: 2.45 | reduce_tied_grads: 0.25 | comms: 17.83 | reduce_grads: 0.18 | step: 351.30 | _step_clipping: 0.11 | _step_step: 349.52 | _step_zero_grad: 0.46 | _step_check_overflow: 0.55 samples/sec: 16.324 | iteration 7430/ 143000 | elapsed time per iteration (ms): 62728.8 | learning rate: 5.974E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.629723E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 08:53:11,704] [INFO] [logging.py:60:log_dist] [Rank 0] step=7440, skipped=0, lr=[0.0005973888197393692, 0.0005973888197393692], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7440 loss: 2.6181 iter time (s): 62.360 samples/sec: 16.421 %comms: 0.0028764517454176602 %optimizer_step 0.056256546426207206 %forward: 23.29265453154341 %backward: 62.561451773227894 [2025-03-30 08:53:11,705] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18745.17 | forward: 145253.86 | backward_microstep: 390142.81 | backward: 390135.54 | backward_inner_microstep: 390120.31 | backward_inner: 390114.54 | backward_allreduce_microstep: 7.31 | backward_allreduce: 2.53 | reduce_tied_grads: 0.28 | comms: 17.94 | reduce_grads: 0.18 | step: 350.82 | _step_clipping: 0.12 | _step_step: 349.13 | _step_zero_grad: 0.46 | _step_check_overflow: 0.56 samples/sec: 16.421 | iteration 7440/ 143000 | elapsed time per iteration (ms): 62360.9 | learning rate: 5.974E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.630055E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 09:03:38,773] [INFO] [logging.py:60:log_dist] [Rank 0] step=7450, skipped=0, lr=[0.0005973801357383984, 0.0005973801357383984], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7450 loss: 2.6410 iter time (s): 62.706 samples/sec: 16.330 %comms: 0.0028818380953541584 %optimizer_step 0.055682229693483645 %forward: 23.16238843876044 %backward: 62.19625428376041 [2025-03-30 09:03:38,774] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22348.01 | forward: 145242.73 | backward_microstep: 390016.43 | backward: 390009.61 | backward_inner_microstep: 389992.82 | backward_inner: 389987.04 | backward_allreduce_microstep: 7.21 | backward_allreduce: 2.48 | reduce_tied_grads: 0.30 | comms: 18.07 | reduce_grads: 0.19 | step: 349.16 | _step_clipping: 0.13 | _step_step: 347.47 | _step_zero_grad: 0.49 | _step_check_overflow: 0.52 samples/sec: 16.330 | iteration 7450/ 143000 | elapsed time per iteration (ms): 62706.8 | learning rate: 5.974E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.635373E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 09:14:06,252] [INFO] [logging.py:60:log_dist] [Rank 0] step=7460, skipped=0, lr=[0.0005973714373845286, 0.0005973714373845286], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7460 loss: 2.6232 iter time (s): 62.747 samples/sec: 16.319 %comms: 0.002834164581063441 %optimizer_step 0.05537377763652863 %forward: 23.122503275170885 %backward: 62.14520214413658 [2025-03-30 09:14:06,253] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23009.00 | forward: 145087.66 | backward_microstep: 389950.67 | backward: 389944.90 | backward_inner_microstep: 389930.07 | backward_inner: 389922.96 | backward_allreduce_microstep: 7.32 | backward_allreduce: 2.49 | reduce_tied_grads: 0.23 | comms: 17.78 | reduce_grads: 0.18 | step: 347.46 | _step_clipping: 0.11 | _step_step: 345.80 | _step_zero_grad: 0.49 | _step_check_overflow: 0.53 samples/sec: 16.319 | iteration 7460/ 143000 | elapsed time per iteration (ms): 62747.9 | learning rate: 5.974E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.633860E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 09:24:31,665] [INFO] [logging.py:60:log_dist] [Rank 0] step=7470, skipped=0, lr=[0.0005973627246781799, 0.0005973627246781799], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7470 loss: 2.6426 iter time (s): 62.541 samples/sec: 16.373 %comms: 0.002858625137671596 %optimizer_step 0.05523772902250736 %forward: 23.209494859387743 %backward: 62.35916054661651 [2025-03-30 09:24:31,665] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20793.62 | forward: 145153.92 | backward_microstep: 390004.94 | backward: 389998.86 | backward_inner_microstep: 389984.05 | backward_inner: 389978.60 | backward_allreduce_microstep: 7.27 | backward_allreduce: 2.47 | reduce_tied_grads: 0.25 | comms: 17.88 | reduce_grads: 0.17 | step: 345.46 | _step_clipping: 0.11 | _step_step: 343.85 | _step_zero_grad: 0.46 | _step_check_overflow: 0.51 samples/sec: 16.373 | iteration 7470/ 143000 | elapsed time per iteration (ms): 62541.3 | learning rate: 5.974E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.632442E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 09:34:53,109] [INFO] [logging.py:60:log_dist] [Rank 0] step=7480, skipped=0, lr=[0.0005973539976197727, 0.0005973539976197727], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7480 loss: 2.6146 iter time (s): 62.144 samples/sec: 16.478 %comms: 0.0028707048004992237 %optimizer_step 0.060199168827660916 %forward: 23.361320278515034 %backward: 62.77046136185472 [2025-03-30 09:34:53,110] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16688.40 | forward: 145176.28 | backward_microstep: 390086.21 | backward: 390079.93 | backward_inner_microstep: 390064.91 | backward_inner: 390059.30 | backward_allreduce_microstep: 7.29 | backward_allreduce: 2.49 | reduce_tied_grads: 0.26 | comms: 17.84 | reduce_grads: 0.18 | step: 374.10 | _step_clipping: 0.13 | _step_step: 372.41 | _step_zero_grad: 0.49 | _step_check_overflow: 0.52 samples/sec: 16.478 | iteration 7480/ 143000 | elapsed time per iteration (ms): 62144.4 | learning rate: 5.974E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.627965E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 09:45:16,203] [INFO] [logging.py:60:log_dist] [Rank 0] step=7490, skipped=0, lr=[0.0005973452562097281, 0.0005973452562097281], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7490 loss: 2.6296 iter time (s): 62.309 samples/sec: 16.434 %comms: 0.00287354827234878 %optimizer_step 0.057375420292163824 %forward: 23.308729262133166 %backward: 62.60010558493407 [2025-03-30 09:45:16,204] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18321.31 | forward: 145234.09 | backward_microstep: 390060.97 | backward: 390054.27 | backward_inner_microstep: 390039.67 | backward_inner: 390034.24 | backward_allreduce_microstep: 7.03 | backward_allreduce: 2.42 | reduce_tied_grads: 0.30 | comms: 17.90 | reduce_grads: 0.18 | step: 357.50 | _step_clipping: 0.13 | _step_step: 355.80 | _step_zero_grad: 0.46 | _step_check_overflow: 0.57 samples/sec: 16.434 | iteration 7490/ 143000 | elapsed time per iteration (ms): 62309.4 | learning rate: 5.973E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.619594E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 09:55:44,210] [INFO] [logging.py:60:log_dist] [Rank 0] step=7500, skipped=0, lr=[0.0005973365004484682, 0.0005973365004484682], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7500 loss: 2.6126 iter time (s): 62.800 samples/sec: 16.306 %comms: 0.002867658501670775 %optimizer_step 0.05592934446319427 %forward: 23.123890574973085 %backward: 62.110081325310695 [2025-03-30 09:55:44,211] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23262.09 | forward: 145218.45 | backward_microstep: 390058.77 | backward: 390052.44 | backward_inner_microstep: 390037.50 | backward_inner: 390031.92 | backward_allreduce_microstep: 7.22 | backward_allreduce: 2.49 | reduce_tied_grads: 0.27 | comms: 18.01 | reduce_grads: 0.18 | step: 351.24 | _step_clipping: 0.12 | _step_step: 349.51 | _step_zero_grad: 0.48 | _step_check_overflow: 0.58 samples/sec: 16.306 | iteration 7500/ 143000 | elapsed time per iteration (ms): 62800.7 | learning rate: 5.973E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.617453E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 10:06:01,327] [INFO] [logging.py:60:log_dist] [Rank 0] step=7510, skipped=0, lr=[0.0005973277303364155, 0.0005973277303364155], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7510 loss: 2.6103 iter time (s): 61.711 samples/sec: 16.593 %comms: 0.0028916450815009683 %optimizer_step 0.05607962496268115 %forward: 23.51600463215688 %backward: 63.1945662107687 [2025-03-30 10:06:01,328] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 12586.41 | forward: 145119.99 | backward_microstep: 389986.72 | backward: 389980.99 | backward_inner_microstep: 389964.78 | backward_inner: 389959.59 | backward_allreduce_microstep: 8.67 | backward_allreduce: 4.05 | reduce_tied_grads: 0.23 | comms: 17.84 | reduce_grads: 0.17 | step: 346.07 | _step_clipping: 0.12 | _step_step: 344.47 | _step_zero_grad: 0.45 | _step_check_overflow: 0.52 samples/sec: 16.593 | iteration 7510/ 143000 | elapsed time per iteration (ms): 61711.7 | learning rate: 5.973E-04 | approx flops per GPU: 71.6TFLOPS | lm_loss: 2.626417E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 10:16:24,339] [INFO] [logging.py:60:log_dist] [Rank 0] step=7520, skipped=0, lr=[0.0005973189458739932, 0.0005973189458739932], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7520 loss: 2.6452 iter time (s): 62.301 samples/sec: 16.436 %comms: 0.002862866652943245 %optimizer_step 0.05596889190243596 %forward: 23.300726342590682 %backward: 62.60670155902168 [2025-03-30 10:16:24,339] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18330.37 | forward: 145165.12 | backward_microstep: 390050.13 | backward: 390044.04 | backward_inner_microstep: 390029.36 | backward_inner: 390023.76 | backward_allreduce_microstep: 7.14 | backward_allreduce: 2.45 | reduce_tied_grads: 0.25 | comms: 17.84 | reduce_grads: 0.18 | step: 348.69 | _step_clipping: 0.12 | _step_step: 347.05 | _step_zero_grad: 0.45 | _step_check_overflow: 0.53 samples/sec: 16.436 | iteration 7520/ 143000 | elapsed time per iteration (ms): 62301.2 | learning rate: 5.973E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.620209E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 10:26:47,944] [INFO] [logging.py:60:log_dist] [Rank 0] step=7530, skipped=0, lr=[0.0005973101470616254, 0.0005973101470616254], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7530 loss: 2.6104 iter time (s): 62.360 samples/sec: 16.421 %comms: 0.002860755698701148 %optimizer_step 0.056023020921092764 %forward: 23.2765430436515 %backward: 62.544523179675735 [2025-03-30 10:26:47,945] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18957.01 | forward: 145152.50 | backward_microstep: 390033.49 | backward: 390027.60 | backward_inner_microstep: 390011.17 | backward_inner: 390005.82 | backward_allreduce_microstep: 8.88 | backward_allreduce: 4.17 | reduce_tied_grads: 0.26 | comms: 17.84 | reduce_grads: 0.17 | step: 349.36 | _step_clipping: 0.12 | _step_step: 347.68 | _step_zero_grad: 0.46 | _step_check_overflow: 0.57 samples/sec: 16.421 | iteration 7530/ 143000 | elapsed time per iteration (ms): 62360.5 | learning rate: 5.973E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.620027E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 10:37:10,200] [INFO] [logging.py:60:log_dist] [Rank 0] step=7540, skipped=0, lr=[0.0005973013338997368, 0.0005973013338997368], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7540 loss: 2.6111 iter time (s): 62.225 samples/sec: 16.456 %comms: 0.0028618990741614923 %optimizer_step 0.05559826998578375 %forward: 23.32744721160144 %backward: 62.66629258485123 [2025-03-30 10:37:10,201] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17722.45 | forward: 145155.30 | backward_microstep: 389947.56 | backward: 389941.70 | backward_inner_microstep: 389926.96 | backward_inner: 389921.67 | backward_allreduce_microstep: 7.17 | backward_allreduce: 2.59 | reduce_tied_grads: 0.23 | comms: 17.81 | reduce_grads: 0.17 | step: 345.96 | _step_clipping: 0.11 | _step_step: 344.37 | _step_zero_grad: 0.45 | _step_check_overflow: 0.51 samples/sec: 16.456 | iteration 7540/ 143000 | elapsed time per iteration (ms): 62225.6 | learning rate: 5.973E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.619804E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 10:47:32,258] [INFO] [logging.py:60:log_dist] [Rank 0] step=7550, skipped=0, lr=[0.0005972925063887526, 0.0005972925063887526], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7550 loss: 2.6021 iter time (s): 62.205 samples/sec: 16.462 %comms: 0.00289178840330951 %optimizer_step 0.0571831619197449 %forward: 23.344010542760884 %backward: 62.698763165898775 [2025-03-30 10:47:32,259] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17371.57 | forward: 145212.01 | backward_microstep: 390025.55 | backward: 390019.25 | backward_inner_microstep: 390004.36 | backward_inner: 389998.95 | backward_allreduce_microstep: 7.09 | backward_allreduce: 2.43 | reduce_tied_grads: 0.25 | comms: 17.99 | reduce_grads: 0.18 | step: 355.71 | _step_clipping: 0.11 | _step_step: 354.04 | _step_zero_grad: 0.46 | _step_check_overflow: 0.58 samples/sec: 16.461 | iteration 7550/ 143000 | elapsed time per iteration (ms): 62205.9 | learning rate: 5.973E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.613981E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 10:57:49,733] [INFO] [logging.py:60:log_dist] [Rank 0] step=7560, skipped=0, lr=[0.000597283664529099, 0.000597283664529099], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7560 loss: 2.6241 iter time (s): 61.747 samples/sec: 16.584 %comms: 0.0028893153247868146 %optimizer_step 0.05809860229478313 %forward: 23.505248429182537 %backward: 63.15944278893886 [2025-03-30 10:57:49,734] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 12893.16 | forward: 145137.60 | backward_microstep: 389996.12 | backward: 389989.92 | backward_inner_microstep: 389969.88 | backward_inner: 389964.50 | backward_allreduce_microstep: 10.75 | backward_allreduce: 4.12 | reduce_tied_grads: 0.26 | comms: 17.84 | reduce_grads: 0.19 | step: 358.74 | _step_clipping: 0.11 | _step_step: 357.09 | _step_zero_grad: 0.48 | _step_check_overflow: 0.51 samples/sec: 16.584 | iteration 7560/ 143000 | elapsed time per iteration (ms): 61747.4 | learning rate: 5.973E-04 | approx flops per GPU: 71.5TFLOPS | lm_loss: 2.615336E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 11:08:06,840] [INFO] [logging.py:60:log_dist] [Rank 0] step=7570, skipped=0, lr=[0.0005972748083212026, 0.0005972748083212026], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7570 loss: 2.6188 iter time (s): 61.710 samples/sec: 16.594 %comms: 0.002885318204915697 %optimizer_step 0.05678734769333369 %forward: 23.518073992775115 %backward: 63.19651436004994 [2025-03-30 11:08:06,840] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 12546.96 | forward: 145130.35 | backward_microstep: 389992.28 | backward: 389986.55 | backward_inner_microstep: 389970.42 | backward_inner: 389965.21 | backward_allreduce_microstep: 8.73 | backward_allreduce: 2.41 | reduce_tied_grads: 0.22 | comms: 17.81 | reduce_grads: 0.17 | step: 350.44 | _step_clipping: 0.10 | _step_step: 348.81 | _step_zero_grad: 0.46 | _step_check_overflow: 0.54 samples/sec: 16.594 | iteration 7570/ 143000 | elapsed time per iteration (ms): 61710.7 | learning rate: 5.973E-04 | approx flops per GPU: 71.6TFLOPS | lm_loss: 2.623117E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 11:18:24,323] [INFO] [logging.py:60:log_dist] [Rank 0] step=7580, skipped=0, lr=[0.0005972659377654908, 0.0005972659377654908], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7580 loss: 2.6313 iter time (s): 61.748 samples/sec: 16.584 %comms: 0.002884599892058093 %optimizer_step 0.05810274867978308 %forward: 23.506718231719972 %backward: 63.151517594439774 [2025-03-30 11:18:24,323] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 12933.86 | forward: 145148.86 | backward_microstep: 389952.89 | backward: 389946.85 | backward_inner_microstep: 389932.10 | backward_inner: 389926.74 | backward_allreduce_microstep: 7.20 | backward_allreduce: 2.47 | reduce_tied_grads: 0.23 | comms: 17.81 | reduce_grads: 0.17 | step: 358.77 | _step_clipping: 0.10 | _step_step: 357.20 | _step_zero_grad: 0.45 | _step_check_overflow: 0.49 samples/sec: 16.583 | iteration 7580/ 143000 | elapsed time per iteration (ms): 61748.3 | learning rate: 5.973E-04 | approx flops per GPU: 71.5TFLOPS | lm_loss: 2.617454E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 11:28:50,815] [INFO] [logging.py:60:log_dist] [Rank 0] step=7590, skipped=0, lr=[0.0005972570528623921, 0.0005972570528623921], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7590 loss: 2.6318 iter time (s): 62.649 samples/sec: 16.345 %comms: 0.0028449069232576365 %optimizer_step 0.05532930102278269 %forward: 23.177421235205923 %backward: 62.25056568322589 [2025-03-30 11:28:50,816] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21832.43 | forward: 145203.60 | backward_microstep: 389998.27 | backward: 389991.89 | backward_inner_microstep: 389977.12 | backward_inner: 389971.57 | backward_allreduce_microstep: 7.15 | backward_allreduce: 2.47 | reduce_tied_grads: 0.26 | comms: 17.82 | reduce_grads: 0.18 | step: 346.63 | _step_clipping: 0.11 | _step_step: 345.03 | _step_zero_grad: 0.46 | _step_check_overflow: 0.50 samples/sec: 16.345 | iteration 7590/ 143000 | elapsed time per iteration (ms): 62649.2 | learning rate: 5.973E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.630660E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 11:39:16,311] [INFO] [logging.py:60:log_dist] [Rank 0] step=7600, skipped=0, lr=[0.0005972481536123351, 0.0005972481536123351], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7600 loss: 2.6345 iter time (s): 62.549 samples/sec: 16.371 %comms: 0.0028692606975865382 %optimizer_step 0.055164019283773805 %forward: 23.215835112339764 %backward: 62.35872480737268 [2025-03-30 11:39:16,312] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20772.34 | forward: 145212.89 | backward_microstep: 390054.33 | backward: 390048.02 | backward_inner_microstep: 390033.35 | backward_inner: 390027.90 | backward_allreduce_microstep: 7.07 | backward_allreduce: 2.43 | reduce_tied_grads: 0.25 | comms: 17.95 | reduce_grads: 0.18 | step: 345.05 | _step_clipping: 0.12 | _step_step: 343.38 | _step_zero_grad: 0.46 | _step_check_overflow: 0.53 samples/sec: 16.371 | iteration 7600/ 143000 | elapsed time per iteration (ms): 62549.6 | learning rate: 5.972E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.631056E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 11:49:43,002] [INFO] [logging.py:60:log_dist] [Rank 0] step=7610, skipped=0, lr=[0.0005972392400157491, 0.0005972392400157491], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7610 loss: 2.6071 iter time (s): 62.669 samples/sec: 16.340 %comms: 0.0028516521500943043 %optimizer_step 0.05605328690735891 %forward: 23.174328255274578 %backward: 62.235676529810355 [2025-03-30 11:49:43,003] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21981.65 | forward: 145230.26 | backward_microstep: 390030.13 | backward: 390022.23 | backward_inner_microstep: 390007.53 | backward_inner: 390002.00 | backward_allreduce_microstep: 7.11 | backward_allreduce: 2.44 | reduce_tied_grads: 0.25 | comms: 17.87 | reduce_grads: 0.18 | step: 351.28 | _step_clipping: 0.13 | _step_step: 349.67 | _step_zero_grad: 0.46 | _step_check_overflow: 0.49 samples/sec: 16.340 | iteration 7610/ 143000 | elapsed time per iteration (ms): 62669.1 | learning rate: 5.972E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.629097E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 12:00:05,369] [INFO] [logging.py:60:log_dist] [Rank 0] step=7620, skipped=0, lr=[0.0005972303120730647, 0.0005972303120730647], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7620 loss: 2.6098 iter time (s): 62.236 samples/sec: 16.453 %comms: 0.0028760263395927617 %optimizer_step 0.056516915229488915 %forward: 23.33385792130385 %backward: 62.68251467623791 [2025-03-30 12:00:05,370] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17552.15 | forward: 145220.90 | backward_microstep: 390120.14 | backward: 390111.71 | backward_inner_microstep: 390095.06 | backward_inner: 390087.51 | backward_allreduce_microstep: 7.07 | backward_allreduce: 2.43 | reduce_tied_grads: 0.30 | comms: 17.90 | reduce_grads: 0.18 | step: 351.74 | _step_clipping: 0.12 | _step_step: 350.04 | _step_zero_grad: 0.48 | _step_check_overflow: 0.55 samples/sec: 16.453 | iteration 7620/ 143000 | elapsed time per iteration (ms): 62236.7 | learning rate: 5.972E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.612659E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 12:10:28,199] [INFO] [logging.py:60:log_dist] [Rank 0] step=7630, skipped=0, lr=[0.0005972213697847126, 0.0005972213697847126], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7630 loss: 2.6492 iter time (s): 62.282 samples/sec: 16.441 %comms: 0.0028835727711628146 %optimizer_step 0.05652235963623466 %forward: 23.326738607611794 %backward: 62.65227155657646 [2025-03-30 12:10:28,199] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17824.42 | forward: 145284.62 | backward_microstep: 390222.82 | backward: 390213.63 | backward_inner_microstep: 390198.36 | backward_inner: 390192.44 | backward_allreduce_microstep: 7.28 | backward_allreduce: 2.50 | reduce_tied_grads: 0.30 | comms: 17.96 | reduce_grads: 0.18 | step: 352.04 | _step_clipping: 0.14 | _step_step: 350.28 | _step_zero_grad: 0.49 | _step_check_overflow: 0.56 samples/sec: 16.441 | iteration 7630/ 143000 | elapsed time per iteration (ms): 62283.0 | learning rate: 5.972E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.625562E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 12:20:55,576] [INFO] [logging.py:60:log_dist] [Rank 0] step=7640, skipped=0, lr=[0.0005972124131511243, 0.0005972124131511243], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7640 loss: 2.6211 iter time (s): 62.737 samples/sec: 16.322 %comms: 0.002843516199040248 %optimizer_step 0.05561904612383929 %forward: 23.156667214291062 %backward: 62.17898034488516 [2025-03-30 12:20:55,577] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22552.54 | forward: 145278.51 | backward_microstep: 390104.18 | backward: 390093.67 | backward_inner_microstep: 390075.50 | backward_inner: 390069.96 | backward_allreduce_microstep: 10.58 | backward_allreduce: 5.93 | reduce_tied_grads: 0.24 | comms: 17.84 | reduce_grads: 0.18 | step: 348.94 | _step_clipping: 0.12 | _step_step: 347.30 | _step_zero_grad: 0.45 | _step_check_overflow: 0.54 samples/sec: 16.322 | iteration 7640/ 143000 | elapsed time per iteration (ms): 62737.7 | learning rate: 5.972E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.631239E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 12:31:22,949] [INFO] [logging.py:60:log_dist] [Rank 0] step=7650, skipped=0, lr=[0.0005972034421727323, 0.0005972034421727323], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7650 loss: 2.6182 iter time (s): 62.737 samples/sec: 16.322 %comms: 0.0028396976364685775 %optimizer_step 0.05498128597530281 %forward: 23.14878706678012 %backward: 62.1673671712106 [2025-03-30 12:31:22,950] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22668.91 | forward: 145228.06 | backward_microstep: 390024.57 | backward: 390018.11 | backward_inner_microstep: 390003.18 | backward_inner: 389997.63 | backward_allreduce_microstep: 7.22 | backward_allreduce: 2.48 | reduce_tied_grads: 0.25 | comms: 17.82 | reduce_grads: 0.17 | step: 344.93 | _step_clipping: 0.12 | _step_step: 343.29 | _step_zero_grad: 0.46 | _step_check_overflow: 0.55 samples/sec: 16.322 | iteration 7650/ 143000 | elapsed time per iteration (ms): 62737.3 | learning rate: 5.972E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.628852E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 12:41:49,027] [INFO] [logging.py:60:log_dist] [Rank 0] step=7660, skipped=0, lr=[0.0005971944568499694, 0.0005971944568499694], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7660 loss: 2.6275 iter time (s): 62.607 samples/sec: 16.356 %comms: 0.002865680447627123 %optimizer_step 0.05605550627020924 %forward: 23.188551211514856 %backward: 62.2961715949086 [2025-03-30 12:41:49,028] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21409.20 | forward: 145177.14 | backward_microstep: 390024.95 | backward: 390019.18 | backward_inner_microstep: 390002.71 | backward_inner: 389997.02 | backward_allreduce_microstep: 7.14 | backward_allreduce: 2.45 | reduce_tied_grads: 0.28 | comms: 17.94 | reduce_grads: 0.18 | step: 350.95 | _step_clipping: 0.13 | _step_step: 349.20 | _step_zero_grad: 0.50 | _step_check_overflow: 0.58 samples/sec: 16.356 | iteration 7660/ 143000 | elapsed time per iteration (ms): 62607.8 | learning rate: 5.972E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.612869E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 12:52:17,132] [INFO] [logging.py:60:log_dist] [Rank 0] step=7670, skipped=0, lr=[0.0005971854571832694, 0.0005971854571832694], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7670 loss: 2.6198 iter time (s): 62.810 samples/sec: 16.303 %comms: 0.002844056783990216 %optimizer_step 0.05564891984634566 %forward: 23.11541122340785 %backward: 62.09073480543228 [2025-03-30 12:52:17,133] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23482.74 | forward: 145187.83 | backward_microstep: 389997.45 | backward: 389991.72 | backward_inner_microstep: 389976.88 | backward_inner: 389971.56 | backward_allreduce_microstep: 7.13 | backward_allreduce: 2.44 | reduce_tied_grads: 0.26 | comms: 17.86 | reduce_grads: 0.18 | step: 349.53 | _step_clipping: 0.11 | _step_step: 347.97 | _step_zero_grad: 0.45 | _step_check_overflow: 0.46 samples/sec: 16.303 | iteration 7670/ 143000 | elapsed time per iteration (ms): 62810.5 | learning rate: 5.972E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.618710E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 13:02:43,862] [INFO] [logging.py:60:log_dist] [Rank 0] step=7680, skipped=0, lr=[0.0005971764431730666, 0.0005971764431730666], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7680 loss: 2.6261 iter time (s): 62.672 samples/sec: 16.339 %comms: 0.002911929613822664 %optimizer_step 0.056120562448519505 %forward: 23.177401142760022 %backward: 62.253262806480514 [2025-03-30 13:02:43,862] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21825.94 | forward: 145258.24 | backward_microstep: 390163.32 | backward: 390155.89 | backward_inner_microstep: 390140.95 | backward_inner: 390135.20 | backward_allreduce_microstep: 7.13 | backward_allreduce: 2.46 | reduce_tied_grads: 0.27 | comms: 18.25 | reduce_grads: 0.18 | step: 351.72 | _step_clipping: 0.13 | _step_step: 350.04 | _step_zero_grad: 0.48 | _step_check_overflow: 0.51 samples/sec: 16.339 | iteration 7680/ 143000 | elapsed time per iteration (ms): 62672.9 | learning rate: 5.972E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.616528E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 13:13:05,778] [INFO] [logging.py:60:log_dist] [Rank 0] step=7690, skipped=0, lr=[0.000597167414819796, 0.000597167414819796], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7690 loss: 2.6267 iter time (s): 62.191 samples/sec: 16.465 %comms: 0.002863652398654621 %optimizer_step 0.05614538277045746 %forward: 23.3392887307648 %backward: 62.72152262660651 [2025-03-30 13:13:05,779] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17253.38 | forward: 145149.78 | backward_microstep: 390078.60 | backward: 390072.53 | backward_inner_microstep: 390057.67 | backward_inner: 390052.17 | backward_allreduce_microstep: 7.09 | backward_allreduce: 2.43 | reduce_tied_grads: 0.24 | comms: 17.81 | reduce_grads: 0.17 | step: 349.17 | _step_clipping: 0.12 | _step_step: 347.53 | _step_zero_grad: 0.45 | _step_check_overflow: 0.55 samples/sec: 16.465 | iteration 7690/ 143000 | elapsed time per iteration (ms): 62191.7 | learning rate: 5.972E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.622045E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 13:23:28,855] [INFO] [logging.py:60:log_dist] [Rank 0] step=7700, skipped=0, lr=[0.0005971583721238935, 0.0005971583721238935], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7700 loss: 2.6083 iter time (s): 62.307 samples/sec: 16.435 %comms: 0.0028681960221834412 %optimizer_step 0.05767398224482888 %forward: 23.298683913660927 %backward: 62.60244063277535 [2025-03-30 13:23:28,855] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18351.59 | forward: 145167.39 | backward_microstep: 390066.20 | backward: 390057.77 | backward_inner_microstep: 390043.10 | backward_inner: 390037.57 | backward_allreduce_microstep: 7.05 | backward_allreduce: 2.41 | reduce_tied_grads: 0.26 | comms: 17.87 | reduce_grads: 0.18 | step: 359.35 | _step_clipping: 0.14 | _step_step: 357.68 | _step_zero_grad: 0.46 | _step_check_overflow: 0.54 samples/sec: 16.435 | iteration 7700/ 143000 | elapsed time per iteration (ms): 62307.6 | learning rate: 5.972E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.621577E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 13:33:56,070] [INFO] [logging.py:60:log_dist] [Rank 0] step=7710, skipped=0, lr=[0.0005971493150857955, 0.0005971493150857955], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7710 loss: 2.6166 iter time (s): 62.721 samples/sec: 16.326 %comms: 0.00285352670186756 %optimizer_step 0.05570158778709335 %forward: 23.15873143805788 %backward: 62.19571686212239 [2025-03-30 13:33:56,071] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22359.15 | forward: 145253.89 | backward_microstep: 390104.51 | backward: 390097.78 | backward_inner_microstep: 390082.72 | backward_inner: 390077.05 | backward_allreduce_microstep: 7.23 | backward_allreduce: 2.49 | reduce_tied_grads: 0.27 | comms: 17.90 | reduce_grads: 0.19 | step: 349.37 | _step_clipping: 0.11 | _step_step: 347.70 | _step_zero_grad: 0.46 | _step_check_overflow: 0.56 samples/sec: 16.326 | iteration 7710/ 143000 | elapsed time per iteration (ms): 62721.5 | learning rate: 5.971E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.608836E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 13:44:21,746] [INFO] [logging.py:60:log_dist] [Rank 0] step=7720, skipped=0, lr=[0.0005971402437059391, 0.0005971402437059391], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7720 loss: 2.6267 iter time (s): 62.567 samples/sec: 16.366 %comms: 0.002861956225102031 %optimizer_step 0.05565036821559926 %forward: 23.20021494870747 %backward: 62.32412863164283 [2025-03-30 13:44:21,747] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21132.24 | forward: 145156.99 | backward_microstep: 389951.75 | backward: 389943.94 | backward_inner_microstep: 389926.89 | backward_inner: 389917.95 | backward_allreduce_microstep: 7.12 | backward_allreduce: 2.45 | reduce_tied_grads: 0.26 | comms: 17.91 | reduce_grads: 0.18 | step: 348.19 | _step_clipping: 0.15 | _step_step: 346.51 | _step_zero_grad: 0.46 | _step_check_overflow: 0.53 samples/sec: 16.366 | iteration 7720/ 143000 | elapsed time per iteration (ms): 62567.6 | learning rate: 5.971E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.616287E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 13:54:47,349] [INFO] [logging.py:60:log_dist] [Rank 0] step=7730, skipped=0, lr=[0.0005971311579847621, 0.0005971311579847621], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7730 loss: 2.6322 iter time (s): 62.560 samples/sec: 16.368 %comms: 0.002877384678284789 %optimizer_step 0.055968468571341204 %forward: 23.21576986502568 %backward: 62.37323993086793 [2025-03-30 13:54:47,350] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20668.17 | forward: 145237.23 | backward_microstep: 390214.57 | backward: 390205.31 | backward_inner_microstep: 390190.20 | backward_inner: 390184.37 | backward_allreduce_microstep: 7.20 | backward_allreduce: 2.47 | reduce_tied_grads: 0.30 | comms: 18.00 | reduce_grads: 0.18 | step: 350.14 | _step_clipping: 0.13 | _step_step: 348.46 | _step_zero_grad: 0.47 | _step_check_overflow: 0.53 samples/sec: 16.368 | iteration 7730/ 143000 | elapsed time per iteration (ms): 62560.4 | learning rate: 5.971E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.617788E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 14:05:08,510] [INFO] [logging.py:60:log_dist] [Rank 0] step=7740, skipped=0, lr=[0.0005971220579227029, 0.0005971220579227029], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7740 loss: 2.6218 iter time (s): 62.115 samples/sec: 16.485 %comms: 0.0028707114346840667 %optimizer_step 0.055993784815897195 %forward: 23.36878183765547 %backward: 62.79055303881363 [2025-03-30 14:05:08,511] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16521.05 | forward: 145156.33 | backward_microstep: 390032.30 | backward: 390026.58 | backward_inner_microstep: 390012.30 | backward_inner: 390007.09 | backward_allreduce_microstep: 6.92 | backward_allreduce: 2.37 | reduce_tied_grads: 0.25 | comms: 17.83 | reduce_grads: 0.17 | step: 347.81 | _step_clipping: 0.11 | _step_step: 346.16 | _step_zero_grad: 0.45 | _step_check_overflow: 0.56 samples/sec: 16.485 | iteration 7740/ 143000 | elapsed time per iteration (ms): 62116.0 | learning rate: 5.971E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.615075E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 14:15:31,399] [INFO] [logging.py:60:log_dist] [Rank 0] step=7750, skipped=0, lr=[0.000597112943520201, 0.000597112943520201], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7750 loss: 2.6194 iter time (s): 62.288 samples/sec: 16.440 %comms: 0.0028755327364097124 %optimizer_step 0.05696314563985342 %forward: 23.314401177941917 %backward: 62.61833183709173 [2025-03-30 14:15:31,399] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18155.87 | forward: 145221.37 | backward_microstep: 390044.72 | backward: 390038.75 | backward_inner_microstep: 390022.33 | backward_inner: 390017.02 | backward_allreduce_microstep: 7.03 | backward_allreduce: 2.41 | reduce_tied_grads: 0.27 | comms: 17.91 | reduce_grads: 0.19 | step: 354.81 | _step_clipping: 0.15 | _step_step: 353.05 | _step_zero_grad: 0.46 | _step_check_overflow: 0.57 samples/sec: 16.440 | iteration 7750/ 143000 | elapsed time per iteration (ms): 62288.8 | learning rate: 5.971E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.614806E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 14:25:52,984] [INFO] [logging.py:60:log_dist] [Rank 0] step=7760, skipped=0, lr=[0.0005971038147776961, 0.0005971038147776961], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7760 loss: 2.6047 iter time (s): 62.158 samples/sec: 16.474 %comms: 0.002966862708467865 %optimizer_step 0.05671161385102415 %forward: 23.36842337406789 %backward: 62.74901502701137 [2025-03-30 14:25:52,985] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16834.56 | forward: 145253.55 | backward_microstep: 390042.11 | backward: 390035.61 | backward_inner_microstep: 390020.83 | backward_inner: 390015.35 | backward_allreduce_microstep: 7.15 | backward_allreduce: 2.45 | reduce_tied_grads: 0.27 | comms: 18.44 | reduce_grads: 0.18 | step: 352.51 | _step_clipping: 0.11 | _step_step: 350.93 | _step_zero_grad: 0.46 | _step_check_overflow: 0.47 samples/sec: 16.474 | iteration 7760/ 143000 | elapsed time per iteration (ms): 62158.6 | learning rate: 5.971E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.612564E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 14:36:14,793] [INFO] [logging.py:60:log_dist] [Rank 0] step=7770, skipped=0, lr=[0.000597094671695629, 0.000597094671695629], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7770 loss: 2.6236 iter time (s): 62.180 samples/sec: 16.468 %comms: 0.002913576137822244 %optimizer_step 0.05531929678711297 %forward: 23.938092773481955 %backward: 62.72977691008974 [2025-03-30 14:36:14,793] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13429.82 | forward: 148847.85 | backward_microstep: 390062.38 | backward: 390055.82 | backward_inner_microstep: 390040.70 | backward_inner: 390033.19 | backward_allreduce_microstep: 7.32 | backward_allreduce: 2.53 | reduce_tied_grads: 0.42 | comms: 18.12 | reduce_grads: 0.18 | step: 343.98 | _step_clipping: 0.11 | _step_step: 342.30 | _step_zero_grad: 0.47 | _step_check_overflow: 0.56 samples/sec: 16.468 | iteration 7770/ 143000 | elapsed time per iteration (ms): 62180.9 | learning rate: 5.971E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.613882E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 14:46:41,478] [INFO] [logging.py:60:log_dist] [Rank 0] step=7780, skipped=0, lr=[0.0005970855142744407, 0.0005970855142744407], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7780 loss: 2.6272 iter time (s): 62.668 samples/sec: 16.340 %comms: 0.002846011203161963 %optimizer_step 0.05579117097360543 %forward: 23.165184791176053 %backward: 62.236410768069184 [2025-03-30 14:46:41,479] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22013.21 | forward: 145171.54 | backward_microstep: 390028.91 | backward: 390023.04 | backward_inner_microstep: 390008.59 | backward_inner: 390003.28 | backward_allreduce_microstep: 7.05 | backward_allreduce: 2.43 | reduce_tied_grads: 0.24 | comms: 17.84 | reduce_grads: 0.17 | step: 349.63 | _step_clipping: 0.11 | _step_step: 347.99 | _step_zero_grad: 0.47 | _step_check_overflow: 0.53 samples/sec: 16.340 | iteration 7780/ 143000 | elapsed time per iteration (ms): 62668.5 | learning rate: 5.971E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.615474E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 14:57:04,300] [INFO] [logging.py:60:log_dist] [Rank 0] step=7790, skipped=0, lr=[0.0005970763425145732, 0.0005970763425145732], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7790 loss: 2.6042 iter time (s): 62.282 samples/sec: 16.441 %comms: 0.0029059641129754936 %optimizer_step 0.05626194122255316 %forward: 23.30484079262635 %backward: 62.623255913547396 [2025-03-30 14:57:04,301] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18166.86 | forward: 145146.45 | backward_microstep: 390034.28 | backward: 390028.13 | backward_inner_microstep: 390013.56 | backward_inner: 390008.03 | backward_allreduce_microstep: 7.05 | backward_allreduce: 2.41 | reduce_tied_grads: 0.30 | comms: 18.10 | reduce_grads: 0.19 | step: 350.41 | _step_clipping: 0.12 | _step_step: 348.75 | _step_zero_grad: 0.48 | _step_check_overflow: 0.48 samples/sec: 16.441 | iteration 7790/ 143000 | elapsed time per iteration (ms): 62282.3 | learning rate: 5.971E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.607879E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 15:07:26,920] [INFO] [logging.py:60:log_dist] [Rank 0] step=7800, skipped=0, lr=[0.0005970671564164695, 0.0005970671564164695], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7800 loss: 2.6182 iter time (s): 62.261 samples/sec: 16.447 %comms: 0.0029007877933005456 %optimizer_step 0.056332883847421585 %forward: 23.32144131982867 %backward: 62.654097090167774 [2025-03-30 15:07:26,921] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17827.55 | forward: 145202.34 | backward_microstep: 390098.95 | backward: 390092.60 | backward_inner_microstep: 390077.62 | backward_inner: 390070.25 | backward_allreduce_microstep: 7.25 | backward_allreduce: 2.51 | reduce_tied_grads: 0.30 | comms: 18.06 | reduce_grads: 0.20 | step: 350.74 | _step_clipping: 0.14 | _step_step: 349.01 | _step_zero_grad: 0.48 | _step_check_overflow: 0.53 samples/sec: 16.447 | iteration 7800/ 143000 | elapsed time per iteration (ms): 62261.9 | learning rate: 5.971E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.604579E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 15:17:53,235] [INFO] [logging.py:60:log_dist] [Rank 0] step=7810, skipped=0, lr=[0.0005970579559805726, 0.0005970579559805726], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7810 loss: 2.6018 iter time (s): 62.631 samples/sec: 16.350 %comms: 0.002854812746577889 %optimizer_step 0.05596455468696469 %forward: 23.184759459918567 %backward: 62.299376900782896 [2025-03-30 15:17:53,235] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21387.45 | forward: 145208.35 | backward_microstep: 390195.29 | backward: 390186.90 | backward_inner_microstep: 390171.99 | backward_inner: 390164.69 | backward_allreduce_microstep: 7.29 | backward_allreduce: 2.42 | reduce_tied_grads: 0.26 | comms: 17.88 | reduce_grads: 0.17 | step: 350.51 | _step_clipping: 0.11 | _step_step: 348.85 | _step_zero_grad: 0.46 | _step_check_overflow: 0.56 samples/sec: 16.350 | iteration 7810/ 143000 | elapsed time per iteration (ms): 62631.5 | learning rate: 5.971E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.605405E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 15:28:15,683] [INFO] [logging.py:60:log_dist] [Rank 0] step=7820, skipped=0, lr=[0.0005970487412073266, 0.0005970487412073266], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7820 loss: 2.6110 iter time (s): 62.244 samples/sec: 16.451 %comms: 0.0028862571902299352 %optimizer_step 0.05632342146310194 %forward: 23.337310809373733 %backward: 62.689863506613804 [2025-03-30 15:28:15,684] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17451.41 | forward: 145261.55 | backward_microstep: 390215.60 | backward: 390208.90 | backward_inner_microstep: 390192.29 | backward_inner: 390186.64 | backward_allreduce_microstep: 7.17 | backward_allreduce: 2.46 | reduce_tied_grads: 0.28 | comms: 17.97 | reduce_grads: 0.18 | step: 350.58 | _step_clipping: 0.12 | _step_step: 348.88 | _step_zero_grad: 0.46 | _step_check_overflow: 0.56 samples/sec: 16.451 | iteration 7820/ 143000 | elapsed time per iteration (ms): 62244.9 | learning rate: 5.970E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.611237E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 15:38:42,528] [INFO] [logging.py:60:log_dist] [Rank 0] step=7830, skipped=0, lr=[0.0005970395120971766, 0.0005970395120971766], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7830 loss: 2.5970 iter time (s): 62.684 samples/sec: 16.336 %comms: 0.002848826766132483 %optimizer_step 0.05585666874754634 %forward: 23.162207333279845 %backward: 62.2302634826817 [2025-03-30 15:38:42,528] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22051.40 | forward: 145189.70 | backward_microstep: 390091.71 | backward: 390083.44 | backward_inner_microstep: 390066.75 | backward_inner: 390061.11 | backward_allreduce_microstep: 7.17 | backward_allreduce: 2.45 | reduce_tied_grads: 0.29 | comms: 17.86 | reduce_grads: 0.18 | step: 350.13 | _step_clipping: 0.12 | _step_step: 348.36 | _step_zero_grad: 0.49 | _step_check_overflow: 0.59 samples/sec: 16.336 | iteration 7830/ 143000 | elapsed time per iteration (ms): 62684.4 | learning rate: 5.970E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.606618E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 15:49:00,787] [INFO] [logging.py:60:log_dist] [Rank 0] step=7840, skipped=0, lr=[0.0005970302686505675, 0.0005970302686505675], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7840 loss: 2.6198 iter time (s): 61.825 samples/sec: 16.563 %comms: 0.0028913517920601643 %optimizer_step 0.056468339205039464 %forward: 23.48739127863423 %backward: 63.10109084517194 [2025-03-30 15:49:00,788] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13385.44 | forward: 145211.83 | backward_microstep: 390132.65 | backward: 390125.27 | backward_inner_microstep: 390109.89 | backward_inner: 390104.06 | backward_allreduce_microstep: 7.46 | backward_allreduce: 2.47 | reduce_tied_grads: 0.29 | comms: 17.88 | reduce_grads: 0.18 | step: 349.12 | _step_clipping: 0.12 | _step_step: 347.49 | _step_zero_grad: 0.47 | _step_check_overflow: 0.50 samples/sec: 16.563 | iteration 7840/ 143000 | elapsed time per iteration (ms): 61826.0 | learning rate: 5.970E-04 | approx flops per GPU: 71.4TFLOPS | lm_loss: 2.609128E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 15:59:28,592] [INFO] [logging.py:60:log_dist] [Rank 0] step=7850, skipped=0, lr=[0.0005970210108679459, 0.0005970210108679459], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7850 loss: 2.5977 iter time (s): 62.780 samples/sec: 16.311 %comms: 0.002848608142581461 %optimizer_step 0.05498361721628078 %forward: 23.145701227844253 %backward: 62.135233988001126 [2025-03-30 15:59:28,592] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22912.54 | forward: 145308.53 | backward_microstep: 390091.56 | backward: 390084.50 | backward_inner_microstep: 390069.45 | backward_inner: 390062.00 | backward_allreduce_microstep: 7.23 | backward_allreduce: 2.49 | reduce_tied_grads: 0.27 | comms: 17.88 | reduce_grads: 0.19 | step: 345.19 | _step_clipping: 0.13 | _step_step: 343.55 | _step_zero_grad: 0.45 | _step_check_overflow: 0.50 samples/sec: 16.311 | iteration 7850/ 143000 | elapsed time per iteration (ms): 62780.4 | learning rate: 5.970E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.604385E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 16:09:46,742] [INFO] [logging.py:60:log_dist] [Rank 0] step=7860, skipped=0, lr=[0.0005970117387497584, 0.0005970117387497584], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7860 loss: 2.6006 iter time (s): 61.815 samples/sec: 16.566 %comms: 0.0028922086323952306 %optimizer_step 0.056572906057824515 %forward: 23.49301676715547 %backward: 63.117077556575104 [2025-03-30 16:09:46,743] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13253.23 | forward: 145221.01 | backward_microstep: 390162.52 | backward: 390155.33 | backward_inner_microstep: 390140.23 | backward_inner: 390134.52 | backward_allreduce_microstep: 7.31 | backward_allreduce: 2.44 | reduce_tied_grads: 0.28 | comms: 17.88 | reduce_grads: 0.18 | step: 349.70 | _step_clipping: 0.11 | _step_step: 348.07 | _step_zero_grad: 0.45 | _step_check_overflow: 0.53 samples/sec: 16.566 | iteration 7860/ 143000 | elapsed time per iteration (ms): 61815.1 | learning rate: 5.970E-04 | approx flops per GPU: 71.5TFLOPS | lm_loss: 2.607585E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 16:20:08,833] [INFO] [logging.py:60:log_dist] [Rank 0] step=7870, skipped=0, lr=[0.0005970024522964525, 0.0005970024522964525], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7870 loss: 2.6239 iter time (s): 62.209 samples/sec: 16.461 %comms: 0.002862470003742869 %optimizer_step 0.05565627730927812 %forward: 23.34102885955419 %backward: 62.70893858439111 [2025-03-30 16:20:08,833] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17288.31 | forward: 145201.12 | backward_microstep: 390111.90 | backward: 390103.13 | backward_inner_microstep: 390084.38 | backward_inner: 390078.65 | backward_allreduce_microstep: 7.17 | backward_allreduce: 2.46 | reduce_tied_grads: 0.24 | comms: 17.81 | reduce_grads: 0.18 | step: 346.23 | _step_clipping: 0.11 | _step_step: 344.63 | _step_zero_grad: 0.45 | _step_check_overflow: 0.52 samples/sec: 16.461 | iteration 7870/ 143000 | elapsed time per iteration (ms): 62209.0 | learning rate: 5.970E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.616659E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 16:30:33,840] [INFO] [logging.py:60:log_dist] [Rank 0] step=7880, skipped=0, lr=[0.0005969931515084764, 0.0005969931515084764], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7880 loss: 2.6041 iter time (s): 62.500 samples/sec: 16.384 %comms: 0.0028480048255445697 %optimizer_step 0.055007245400722445 %forward: 23.236639934748155 %backward: 62.39927471263118 [2025-03-30 16:30:33,841] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20303.39 | forward: 145229.51 | backward_microstep: 390004.62 | backward: 389996.84 | backward_inner_microstep: 389978.53 | backward_inner: 389972.98 | backward_allreduce_microstep: 10.67 | backward_allreduce: 5.99 | reduce_tied_grads: 0.26 | comms: 17.80 | reduce_grads: 0.18 | step: 343.80 | _step_clipping: 0.14 | _step_step: 342.12 | _step_zero_grad: 0.47 | _step_check_overflow: 0.54 samples/sec: 16.384 | iteration 7880/ 143000 | elapsed time per iteration (ms): 62500.7 | learning rate: 5.970E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.611038E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 16:41:00,102] [INFO] [logging.py:60:log_dist] [Rank 0] step=7890, skipped=0, lr=[0.0005969838363862792, 0.0005969838363862792], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7890 loss: 2.5934 iter time (s): 62.626 samples/sec: 16.351 %comms: 0.002854027357589948 %optimizer_step 0.055412528494984015 %forward: 23.199075187777474 %backward: 62.28084593218597 [2025-03-30 16:41:00,102] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21444.52 | forward: 145285.67 | backward_microstep: 390043.63 | backward: 390037.71 | backward_inner_microstep: 390022.97 | backward_inner: 390017.45 | backward_allreduce_microstep: 7.12 | backward_allreduce: 2.44 | reduce_tied_grads: 0.26 | comms: 17.87 | reduce_grads: 0.18 | step: 347.02 | _step_clipping: 0.12 | _step_step: 345.41 | _step_zero_grad: 0.47 | _step_check_overflow: 0.49 samples/sec: 16.351 | iteration 7890/ 143000 | elapsed time per iteration (ms): 62626.2 | learning rate: 5.970E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.610619E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 16:51:22,056] [INFO] [logging.py:60:log_dist] [Rank 0] step=7900, skipped=0, lr=[0.0005969745069303102, 0.0005969745069303102], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7900 loss: 2.6140 iter time (s): 62.195 samples/sec: 16.464 %comms: 0.002876706640772839 %optimizer_step 0.057148299995082086 %forward: 23.346239100640187 %backward: 62.72477049854036 [2025-03-30 16:51:22,056] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17132.28 | forward: 145201.68 | backward_microstep: 390122.37 | backward: 390116.02 | backward_inner_microstep: 390101.21 | backward_inner: 390095.56 | backward_allreduce_microstep: 7.12 | backward_allreduce: 2.44 | reduce_tied_grads: 0.28 | comms: 17.89 | reduce_grads: 0.18 | step: 355.43 | _step_clipping: 2.01 | _step_step: 351.87 | _step_zero_grad: 0.46 | _step_check_overflow: 0.55 samples/sec: 16.464 | iteration 7900/ 143000 | elapsed time per iteration (ms): 62195.4 | learning rate: 5.970E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.603615E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 17:01:39,970] [INFO] [logging.py:60:log_dist] [Rank 0] step=7910, skipped=0, lr=[0.00059696516314102, 0.00059696516314102], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7910 loss: 2.6039 iter time (s): 61.791 samples/sec: 16.572 %comms: 0.002903773036132866 %optimizer_step 0.05638679884500667 %forward: 23.4990792592816 %backward: 63.12661615002666 [2025-03-30 17:01:39,970] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13173.23 | forward: 145202.86 | backward_microstep: 390073.01 | backward: 390064.87 | backward_inner_microstep: 390046.42 | backward_inner: 390037.48 | backward_allreduce_microstep: 7.15 | backward_allreduce: 2.46 | reduce_tied_grads: 0.24 | comms: 17.94 | reduce_grads: 0.17 | step: 348.42 | _step_clipping: 0.10 | _step_step: 346.78 | _step_zero_grad: 0.46 | _step_check_overflow: 0.54 samples/sec: 16.572 | iteration 7910/ 143000 | elapsed time per iteration (ms): 61791.4 | learning rate: 5.970E-04 | approx flops per GPU: 71.5TFLOPS | lm_loss: 2.607405E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 17:12:02,129] [INFO] [logging.py:60:log_dist] [Rank 0] step=7920, skipped=0, lr=[0.0005969558050188592, 0.0005969558050188592], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7920 loss: 2.6057 iter time (s): 62.215 samples/sec: 16.459 %comms: 0.002874109256203023 %optimizer_step 0.05614208378153487 %forward: 23.34403034261675 %backward: 62.691196520842155 [2025-03-30 17:12:02,129] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17395.21 | forward: 145235.88 | backward_microstep: 390042.46 | backward: 390035.96 | backward_inner_microstep: 390015.77 | backward_inner: 390010.17 | backward_allreduce_microstep: 10.73 | backward_allreduce: 2.50 | reduce_tied_grads: 0.28 | comms: 17.88 | reduce_grads: 0.18 | step: 349.29 | _step_clipping: 0.12 | _step_step: 347.68 | _step_zero_grad: 0.46 | _step_check_overflow: 0.50 samples/sec: 16.459 | iteration 7920/ 143000 | elapsed time per iteration (ms): 62215.9 | learning rate: 5.970E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.605145E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 17:22:28,202] [INFO] [logging.py:60:log_dist] [Rank 0] step=7930, skipped=0, lr=[0.0005969464325642798, 0.0005969464325642798], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7930 loss: 2.6099 iter time (s): 62.607 samples/sec: 16.356 %comms: 0.0028479183952320446 %optimizer_step 0.055052634321887936 %forward: 23.190062423039816 %backward: 62.301806554017304 [2025-03-30 17:22:28,202] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21344.05 | forward: 145185.47 | backward_microstep: 390057.46 | backward: 390051.44 | backward_inner_microstep: 390036.78 | backward_inner: 390031.16 | backward_allreduce_microstep: 7.07 | backward_allreduce: 2.43 | reduce_tied_grads: 0.25 | comms: 17.83 | reduce_grads: 0.18 | step: 344.67 | _step_clipping: 0.13 | _step_step: 343.05 | _step_zero_grad: 0.45 | _step_check_overflow: 0.50 samples/sec: 16.356 | iteration 7930/ 143000 | elapsed time per iteration (ms): 62607.3 | learning rate: 5.969E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.597189E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 17:32:54,856] [INFO] [logging.py:60:log_dist] [Rank 0] step=7940, skipped=0, lr=[0.0005969370457777339, 0.0005969370457777339], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7940 loss: 2.6202 iter time (s): 62.665 samples/sec: 16.341 %comms: 0.002857296428265401 %optimizer_step 0.05592375901583838 %forward: 23.18822080757328 %backward: 62.25697850195202 [2025-03-30 17:32:54,857] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21714.87 | forward: 145308.88 | backward_microstep: 390141.05 | backward: 390133.08 | backward_inner_microstep: 390116.62 | backward_inner: 390111.10 | backward_allreduce_microstep: 8.86 | backward_allreduce: 2.45 | reduce_tied_grads: 0.26 | comms: 17.91 | reduce_grads: 0.18 | step: 350.45 | _step_clipping: 0.33 | _step_step: 348.59 | _step_zero_grad: 0.45 | _step_check_overflow: 0.54 samples/sec: 16.341 | iteration 7940/ 143000 | elapsed time per iteration (ms): 62665.5 | learning rate: 5.969E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.609046E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 17:43:16,210] [INFO] [logging.py:60:log_dist] [Rank 0] step=7950, skipped=0, lr=[0.0005969276446596747, 0.0005969276446596747], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7950 loss: 2.6065 iter time (s): 62.135 samples/sec: 16.480 %comms: 0.0028680531975831284 %optimizer_step 0.055373437279178706 %forward: 23.359999524627234 %backward: 62.7637483131611 [2025-03-30 17:43:16,211] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16767.05 | forward: 145146.93 | backward_microstep: 389987.25 | backward: 389981.42 | backward_inner_microstep: 389967.02 | backward_inner: 389961.70 | backward_allreduce_microstep: 7.01 | backward_allreduce: 2.41 | reduce_tied_grads: 0.25 | comms: 17.82 | reduce_grads: 0.18 | step: 344.06 | _step_clipping: 0.12 | _step_step: 342.46 | _step_zero_grad: 0.46 | _step_check_overflow: 0.50 samples/sec: 16.480 | iteration 7950/ 143000 | elapsed time per iteration (ms): 62135.4 | learning rate: 5.969E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.605762E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 17:53:42,150] [INFO] [logging.py:60:log_dist] [Rank 0] step=7960, skipped=0, lr=[0.000596918229210556, 0.000596918229210556], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7960 loss: 2.6055 iter time (s): 62.593 samples/sec: 16.360 %comms: 0.0028568287438669555 %optimizer_step 0.055711817148687295 %forward: 23.207253013197953 %backward: 62.32116396138947 [2025-03-30 17:53:42,150] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21101.27 | forward: 145262.15 | backward_microstep: 390095.76 | backward: 390089.53 | backward_inner_microstep: 390074.79 | backward_inner: 390069.28 | backward_allreduce_microstep: 7.10 | backward_allreduce: 2.44 | reduce_tied_grads: 0.27 | comms: 17.88 | reduce_grads: 0.18 | step: 348.72 | _step_clipping: 0.13 | _step_step: 347.14 | _step_zero_grad: 0.46 | _step_check_overflow: 0.46 samples/sec: 16.359 | iteration 7960/ 143000 | elapsed time per iteration (ms): 62594.0 | learning rate: 5.969E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.606655E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 18:04:03,582] [INFO] [logging.py:60:log_dist] [Rank 0] step=7970, skipped=0, lr=[0.000596908799430832, 0.000596908799430832], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7970 loss: 2.6236 iter time (s): 62.143 samples/sec: 16.478 %comms: 0.0028755525672555797 %optimizer_step 0.0558195972285993 %forward: 23.358288991174547 %backward: 62.76336953847509 [2025-03-30 18:04:03,583] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16768.54 | forward: 145154.82 | backward_microstep: 390034.48 | backward: 390028.80 | backward_inner_microstep: 390014.37 | backward_inner: 390009.01 | backward_allreduce_microstep: 7.02 | backward_allreduce: 2.43 | reduce_tied_grads: 0.27 | comms: 17.87 | reduce_grads: 0.17 | step: 346.88 | _step_clipping: 0.11 | _step_step: 345.26 | _step_zero_grad: 0.46 | _step_check_overflow: 0.52 samples/sec: 16.478 | iteration 7970/ 143000 | elapsed time per iteration (ms): 62143.3 | learning rate: 5.969E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.601210E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 18:14:30,220] [INFO] [logging.py:60:log_dist] [Rank 0] step=7980, skipped=0, lr=[0.000596899355320958, 0.000596899355320958], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7980 loss: 2.6072 iter time (s): 62.663 samples/sec: 16.341 %comms: 0.0028525780257041226 %optimizer_step 0.05558497913148594 %forward: 23.179003565921903 %backward: 62.25855837080309 [2025-03-30 18:14:30,221] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21737.67 | forward: 145247.28 | backward_microstep: 390140.36 | backward: 390132.66 | backward_inner_microstep: 390117.35 | backward_inner: 390111.47 | backward_allreduce_microstep: 7.33 | backward_allreduce: 2.52 | reduce_tied_grads: 0.29 | comms: 17.88 | reduce_grads: 0.18 | step: 348.31 | _step_clipping: 0.13 | _step_step: 346.65 | _step_zero_grad: 0.46 | _step_check_overflow: 0.53 samples/sec: 16.341 | iteration 7980/ 143000 | elapsed time per iteration (ms): 62663.8 | learning rate: 5.969E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.605468E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 18:24:51,252] [INFO] [logging.py:60:log_dist] [Rank 0] step=7990, skipped=0, lr=[0.0005968898968813899, 0.0005968898968813899], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 7990 loss: 2.5893 iter time (s): 62.103 samples/sec: 16.489 %comms: 0.002873226571672585 %optimizer_step 0.05874425958433335 %forward: 23.368727015207188 %backward: 62.80154516420878 [2025-03-30 18:24:51,252] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16376.59 | forward: 145125.88 | backward_microstep: 390019.72 | backward: 390013.94 | backward_inner_microstep: 389999.52 | backward_inner: 389994.24 | backward_allreduce_microstep: 7.05 | backward_allreduce: 2.40 | reduce_tied_grads: 0.26 | comms: 17.84 | reduce_grads: 0.17 | step: 364.82 | _step_clipping: 0.12 | _step_step: 363.20 | _step_zero_grad: 0.46 | _step_check_overflow: 0.52 samples/sec: 16.489 | iteration 7990/ 143000 | elapsed time per iteration (ms): 62103.1 | learning rate: 5.969E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.599721E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 18:35:18,195] [INFO] [logging.py:60:log_dist] [Rank 0] step=8000, skipped=0, lr=[0.0005968804241125839, 0.0005968804241125839], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8000 loss: 2.5993 iter time (s): 62.694 samples/sec: 16.333 %comms: 0.002847386427317246 %optimizer_step 0.055503005272928486 %forward: 23.16246495852507 %backward: 62.2222680227092 [2025-03-30 18:35:18,196] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22138.80 | forward: 145214.34 | backward_microstep: 390101.34 | backward: 390095.16 | backward_inner_microstep: 390080.38 | backward_inner: 390074.90 | backward_allreduce_microstep: 7.18 | backward_allreduce: 2.46 | reduce_tied_grads: 0.25 | comms: 17.85 | reduce_grads: 0.18 | step: 347.97 | _step_clipping: 0.11 | _step_step: 346.37 | _step_zero_grad: 0.46 | _step_check_overflow: 0.49 samples/sec: 16.333 | iteration 8000/ 143000 | elapsed time per iteration (ms): 62694.3 | learning rate: 5.969E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.606992E+00 | loss scale: 1048576.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 18:35:21,011] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: ../checkpoints/mydata_left-pythia160m/global_step8000/mp_rank_00_model_states.pt [2025-03-30 18:35:34,731] [INFO] [engine.py:1805:_copy_recovery_script] creating recovery script ../checkpoints/mydata_left-pythia160m/zero_to_fp32.py [2025-03-30 18:35:34,737] [INFO] [engine.py:1818:_save_zero_checkpoint] zero checkpoint saved ../checkpoints/mydata_left-pythia160m/global_step8000/zero_pp_rank_0_mp_rank_00_optim_states.pt [2025-03-30 18:37:38,146] [INFO] [stage1.py:697:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 1048576.0, reducing to 1048576.0 [2025-03-30 18:38:39,803] [INFO] [stage1.py:697:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 1048576.0, reducing to 524288.0 [2025-03-30 18:45:57,511] [INFO] [logging.py:60:log_dist] [Rank 0] step=8010, skipped=2, lr=[0.0005968728355807952, 0.0005968728355807952], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8010 loss: 2.6013 iter time (s): 62.276 samples/sec: 16.443 %comms: 0.002336958069948925 %optimizer_step 0.04642664561107848 %forward: 23.333256587092293 %backward: 62.63950397252187 [2025-03-30 18:45:57,512] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17888.37 | forward: 145309.26 | backward_microstep: 390098.04 | backward: 390091.28 | backward_inner_microstep: 390076.30 | backward_inner: 390070.71 | backward_allreduce_microstep: 7.25 | backward_allreduce: 2.49 | reduce_tied_grads: 0.26 | comms: 14.55 | reduce_grads: 0.18 | step: 289.12 | _step_clipping: 0.12 | _step_step: 287.52 | _step_zero_grad: 0.45 | _step_check_overflow: 0.52 samples/sec: 16.017 | iteration 8010/ 143000 | elapsed time per iteration (ms): 63931.6 | learning rate: 5.969E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.616904E+00 | loss scale: 524288.0 | number of skipped iterations: 2 | number of nan iterations: 0 | time (ms) [2025-03-30 18:56:26,545] [INFO] [logging.py:60:log_dist] [Rank 0] step=8020, skipped=2, lr=[0.000596863337020514, 0.000596863337020514], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8020 loss: 2.6268 iter time (s): 62.903 samples/sec: 16.279 %comms: 0.002836447619401977 %optimizer_step 0.05562780351065775 %forward: 23.070868524945833 %backward: 61.998032886905094 [2025-03-30 18:56:26,545] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24454.97 | forward: 145122.26 | backward_microstep: 389990.68 | backward: 389985.09 | backward_inner_microstep: 389970.43 | backward_inner: 389965.14 | backward_allreduce_microstep: 7.05 | backward_allreduce: 2.43 | reduce_tied_grads: 0.25 | comms: 17.84 | reduce_grads: 0.17 | step: 349.91 | _step_clipping: 0.11 | _step_step: 348.28 | _step_zero_grad: 0.46 | _step_check_overflow: 0.54 samples/sec: 16.279 | iteration 8020/ 143000 | elapsed time per iteration (ms): 62903.4 | learning rate: 5.969E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.612896E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 19:06:52,942] [INFO] [logging.py:60:log_dist] [Rank 0] step=8030, skipped=2, lr=[0.000596853824132277, 0.000596853824132277], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8030 loss: 2.6096 iter time (s): 62.639 samples/sec: 16.348 %comms: 0.002848005690381217 %optimizer_step 0.05540778418414947 %forward: 23.178317759555174 %backward: 62.271981156654974 [2025-03-30 19:06:52,942] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21634.45 | forward: 145187.05 | backward_microstep: 390073.18 | backward: 390066.50 | backward_inner_microstep: 390051.53 | backward_inner: 390045.90 | backward_allreduce_microstep: 7.23 | backward_allreduce: 2.49 | reduce_tied_grads: 0.25 | comms: 17.84 | reduce_grads: 0.18 | step: 347.07 | _step_clipping: 0.13 | _step_step: 345.42 | _step_zero_grad: 0.46 | _step_check_overflow: 0.54 samples/sec: 16.347 | iteration 8030/ 143000 | elapsed time per iteration (ms): 62639.7 | learning rate: 5.969E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.601405E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 19:17:20,343] [INFO] [logging.py:60:log_dist] [Rank 0] step=8040, skipped=2, lr=[0.0005968442969165432, 0.0005968442969165432], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8040 loss: 2.5858 iter time (s): 62.740 samples/sec: 16.321 %comms: 0.0028649171030992043 %optimizer_step 0.056057866017462696 %forward: 23.138550294784295 %backward: 62.15880294176468 [2025-03-30 19:17:20,344] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22745.54 | forward: 145170.35 | backward_microstep: 389987.97 | backward: 389981.87 | backward_inner_microstep: 389967.17 | backward_inner: 389961.72 | backward_allreduce_microstep: 7.13 | backward_allreduce: 2.45 | reduce_tied_grads: 0.26 | comms: 17.97 | reduce_grads: 0.18 | step: 351.70 | _step_clipping: 0.12 | _step_step: 350.07 | _step_zero_grad: 0.45 | _step_check_overflow: 0.52 samples/sec: 16.321 | iteration 8040/ 143000 | elapsed time per iteration (ms): 62740.1 | learning rate: 5.968E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.608740E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 19:27:42,450] [INFO] [logging.py:60:log_dist] [Rank 0] step=8050, skipped=2, lr=[0.0005968347553737727, 0.0005968347553737727], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8050 loss: 2.6263 iter time (s): 62.210 samples/sec: 16.460 %comms: 0.002866995988315776 %optimizer_step 0.0560708731648678 %forward: 23.33613900465253 %backward: 62.704785357931094 [2025-03-30 19:27:42,450] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17338.62 | forward: 145174.41 | backward_microstep: 390093.96 | backward: 390087.24 | backward_inner_microstep: 390072.31 | backward_inner: 390066.80 | backward_allreduce_microstep: 7.20 | backward_allreduce: 2.46 | reduce_tied_grads: 0.24 | comms: 17.84 | reduce_grads: 0.18 | step: 348.82 | _step_clipping: 0.11 | _step_step: 347.19 | _step_zero_grad: 0.45 | _step_check_overflow: 0.54 samples/sec: 16.460 | iteration 8050/ 143000 | elapsed time per iteration (ms): 62210.6 | learning rate: 5.968E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.606349E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 19:38:00,179] [INFO] [logging.py:60:log_dist] [Rank 0] step=8060, skipped=2, lr=[0.0005968251995044258, 0.0005968251995044258], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8060 loss: 2.5931 iter time (s): 61.772 samples/sec: 16.577 %comms: 0.0028822138622315832 %optimizer_step 0.05650128777925957 %forward: 23.50468608717749 %backward: 63.148036174911816 [2025-03-30 19:38:00,180] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 12963.82 | forward: 145194.24 | backward_microstep: 390087.01 | backward: 390080.99 | backward_inner_microstep: 390066.38 | backward_inner: 390061.08 | backward_allreduce_microstep: 7.12 | backward_allreduce: 2.45 | reduce_tied_grads: 0.23 | comms: 17.80 | reduce_grads: 0.18 | step: 349.02 | _step_clipping: 0.11 | _step_step: 347.42 | _step_zero_grad: 0.44 | _step_check_overflow: 0.52 samples/sec: 16.577 | iteration 8060/ 143000 | elapsed time per iteration (ms): 61773.0 | learning rate: 5.968E-04 | approx flops per GPU: 71.5TFLOPS | lm_loss: 2.627250E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 19:48:25,670] [INFO] [logging.py:60:log_dist] [Rank 0] step=8070, skipped=2, lr=[0.0005968156293089638, 0.0005968156293089638], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8070 loss: 2.6148 iter time (s): 62.549 samples/sec: 16.371 %comms: 0.002948189213075274 %optimizer_step 0.05682391233400102 %forward: 23.218813016948843 %backward: 62.370836169400256 [2025-03-30 19:48:25,670] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20577.95 | forward: 145230.22 | backward_microstep: 390127.01 | backward: 390120.30 | backward_inner_microstep: 390105.35 | backward_inner: 390099.63 | backward_allreduce_microstep: 7.18 | backward_allreduce: 2.46 | reduce_tied_grads: 0.30 | comms: 18.44 | reduce_grads: 0.19 | step: 355.43 | _step_clipping: 0.13 | _step_step: 353.72 | _step_zero_grad: 0.47 | _step_check_overflow: 0.50 samples/sec: 16.371 | iteration 8070/ 143000 | elapsed time per iteration (ms): 62549.0 | learning rate: 5.968E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.604332E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 19:58:47,452] [INFO] [logging.py:60:log_dist] [Rank 0] step=8080, skipped=2, lr=[0.0005968060447878484, 0.0005968060447878484], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8080 loss: 2.6092 iter time (s): 62.178 samples/sec: 16.469 %comms: 0.002875851261080142 %optimizer_step 0.056441648573376274 %forward: 23.362060313832753 %backward: 62.74460845279328 [2025-03-30 19:58:47,453] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16854.06 | forward: 145260.01 | backward_microstep: 390138.22 | backward: 390131.80 | backward_inner_microstep: 390116.80 | backward_inner: 390111.25 | backward_allreduce_microstep: 7.36 | backward_allreduce: 2.67 | reduce_tied_grads: 0.26 | comms: 17.88 | reduce_grads: 0.18 | step: 350.94 | _step_clipping: 0.12 | _step_step: 349.22 | _step_zero_grad: 0.46 | _step_check_overflow: 0.60 samples/sec: 16.469 | iteration 8080/ 143000 | elapsed time per iteration (ms): 62178.3 | learning rate: 5.968E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.601947E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 20:09:10,055] [INFO] [logging.py:60:log_dist] [Rank 0] step=8090, skipped=2, lr=[0.0005967964459415425, 0.0005967964459415425], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8090 loss: 2.5967 iter time (s): 62.260 samples/sec: 16.447 %comms: 0.0028770817656907253 %optimizer_step 0.05616572459997776 %forward: 23.32258559424232 %backward: 62.66108809277843 [2025-03-30 20:09:10,055] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17717.29 | forward: 145205.73 | backward_microstep: 390134.00 | backward: 390126.09 | backward_inner_microstep: 390109.82 | backward_inner: 390104.36 | backward_allreduce_microstep: 8.69 | backward_allreduce: 2.42 | reduce_tied_grads: 0.24 | comms: 17.91 | reduce_grads: 0.18 | step: 349.69 | _step_clipping: 0.11 | _step_step: 348.06 | _step_zero_grad: 0.45 | _step_check_overflow: 0.54 samples/sec: 16.447 | iteration 8090/ 143000 | elapsed time per iteration (ms): 62260.2 | learning rate: 5.968E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.599524E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 20:19:35,119] [INFO] [logging.py:60:log_dist] [Rank 0] step=8100, skipped=2, lr=[0.0005967868327705092, 0.0005967868327705092], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8100 loss: 2.6064 iter time (s): 62.506 samples/sec: 16.382 %comms: 0.0034118833724072154 %optimizer_step 0.056327257822909337 %forward: 23.228959347139373 %backward: 62.402916891487315 [2025-03-30 20:19:35,120] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20295.41 | forward: 145194.83 | backward_microstep: 390062.95 | backward: 390055.40 | backward_inner_microstep: 390040.67 | backward_inner: 390035.30 | backward_allreduce_microstep: 7.20 | backward_allreduce: 2.46 | reduce_tied_grads: 3.72 | comms: 21.33 | reduce_grads: 0.17 | step: 352.08 | _step_clipping: 0.10 | _step_step: 350.49 | _step_zero_grad: 0.46 | _step_check_overflow: 0.51 samples/sec: 16.382 | iteration 8100/ 143000 | elapsed time per iteration (ms): 62506.5 | learning rate: 5.968E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.597960E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 20:30:01,952] [INFO] [logging.py:60:log_dist] [Rank 0] step=8110, skipped=2, lr=[0.0005967772052752126, 0.0005967772052752126], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8110 loss: 2.6030 iter time (s): 62.683 samples/sec: 16.336 %comms: 0.002842335172021831 %optimizer_step 0.0548388344837422 %forward: 23.173654773822268 %backward: 62.221239179933505 [2025-03-30 20:30:01,953] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22010.17 | forward: 145258.88 | backward_microstep: 390026.10 | backward: 390019.95 | backward_inner_microstep: 390005.18 | backward_inner: 389999.62 | backward_allreduce_microstep: 7.16 | backward_allreduce: 2.44 | reduce_tied_grads: 0.24 | comms: 17.82 | reduce_grads: 0.17 | step: 343.74 | _step_clipping: 0.11 | _step_step: 342.15 | _step_zero_grad: 0.45 | _step_check_overflow: 0.52 samples/sec: 16.336 | iteration 8110/ 143000 | elapsed time per iteration (ms): 62683.3 | learning rate: 5.968E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.608928E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 20:40:23,239] [INFO] [logging.py:60:log_dist] [Rank 0] step=8120, skipped=2, lr=[0.0005967675634561172, 0.0005967675634561172], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8120 loss: 2.6155 iter time (s): 62.128 samples/sec: 16.482 %comms: 0.0028777640846803054 %optimizer_step 0.056339398545732916 %forward: 23.36897254898 %backward: 62.78612642867253 [2025-03-30 20:40:23,239] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16478.89 | forward: 145187.05 | backward_microstep: 390084.90 | backward: 390078.44 | backward_inner_microstep: 390063.62 | backward_inner: 390057.95 | backward_allreduce_microstep: 7.10 | backward_allreduce: 2.44 | reduce_tied_grads: 0.28 | comms: 17.88 | reduce_grads: 0.18 | step: 350.03 | _step_clipping: 0.12 | _step_step: 348.41 | _step_zero_grad: 0.45 | _step_check_overflow: 0.51 samples/sec: 16.482 | iteration 8120/ 143000 | elapsed time per iteration (ms): 62128.6 | learning rate: 5.968E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.592181E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 20:50:44,571] [INFO] [logging.py:60:log_dist] [Rank 0] step=8130, skipped=2, lr=[0.0005967579073136884, 0.0005967579073136884], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8130 loss: 2.5906 iter time (s): 62.133 samples/sec: 16.481 %comms: 0.002864887036776982 %optimizer_step 0.05588264156406547 %forward: 23.367336803946102 %backward: 62.781605699231136 [2025-03-30 20:50:44,572] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16542.34 | forward: 145187.69 | backward_microstep: 390085.28 | backward: 390079.38 | backward_inner_microstep: 390063.38 | backward_inner: 390058.16 | backward_allreduce_microstep: 8.64 | backward_allreduce: 2.41 | reduce_tied_grads: 0.23 | comms: 17.80 | reduce_grads: 0.17 | step: 347.21 | _step_clipping: 0.11 | _step_step: 345.61 | _step_zero_grad: 0.45 | _step_check_overflow: 0.53 samples/sec: 16.481 | iteration 8130/ 143000 | elapsed time per iteration (ms): 62133.3 | learning rate: 5.968E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.590695E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 21:01:11,295] [INFO] [logging.py:60:log_dist] [Rank 0] step=8140, skipped=2, lr=[0.0005967482368483923, 0.0005967482368483923], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8140 loss: 2.5945 iter time (s): 62.672 samples/sec: 16.339 %comms: 0.002860783917700262 %optimizer_step 0.05667193919794553 %forward: 23.186854858893348 %backward: 62.24298905592961 [2025-03-30 21:01:11,296] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21762.71 | forward: 145316.43 | backward_microstep: 390095.04 | backward: 390088.66 | backward_inner_microstep: 390073.82 | backward_inner: 390068.33 | backward_allreduce_microstep: 7.17 | backward_allreduce: 2.49 | reduce_tied_grads: 0.26 | comms: 17.93 | reduce_grads: 0.19 | step: 355.17 | _step_clipping: 0.13 | _step_step: 353.52 | _step_zero_grad: 0.47 | _step_check_overflow: 0.51 samples/sec: 16.339 | iteration 8140/ 143000 | elapsed time per iteration (ms): 62672.4 | learning rate: 5.967E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.597680E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 21:11:32,543] [INFO] [logging.py:60:log_dist] [Rank 0] step=8150, skipped=2, lr=[0.0005967385520606957, 0.0005967385520606957], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8150 loss: 2.5885 iter time (s): 62.124 samples/sec: 16.483 %comms: 0.0028774442121193293 %optimizer_step 0.05618877883836257 %forward: 23.366763085436794 %backward: 62.78595923969424 [2025-03-30 21:11:32,544] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16479.33 | forward: 145164.29 | backward_microstep: 390058.89 | backward: 390053.14 | backward_inner_microstep: 390038.47 | backward_inner: 390033.02 | backward_allreduce_microstep: 7.12 | backward_allreduce: 2.45 | reduce_tied_grads: 0.28 | comms: 17.88 | reduce_grads: 0.18 | step: 349.07 | _step_clipping: 0.11 | _step_step: 347.37 | _step_zero_grad: 0.48 | _step_check_overflow: 0.56 samples/sec: 16.483 | iteration 8150/ 143000 | elapsed time per iteration (ms): 62124.8 | learning rate: 5.967E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.598179E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 21:21:54,189] [INFO] [logging.py:60:log_dist] [Rank 0] step=8160, skipped=2, lr=[0.000596728852951066, 0.000596728852951066], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8160 loss: 2.6120 iter time (s): 62.164 samples/sec: 16.473 %comms: 0.002866857392413007 %optimizer_step 0.05584881666028466 %forward: 23.370489911748063 %backward: 62.757269150028726 [2025-03-30 21:21:54,189] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16699.02 | forward: 145280.46 | backward_microstep: 390130.78 | backward: 390124.68 | backward_inner_microstep: 390109.77 | backward_inner: 390104.43 | backward_allreduce_microstep: 7.41 | backward_allreduce: 2.48 | reduce_tied_grads: 0.23 | comms: 17.82 | reduce_grads: 0.17 | step: 347.18 | _step_clipping: 0.12 | _step_step: 345.55 | _step_zero_grad: 0.46 | _step_check_overflow: 0.52 samples/sec: 16.472 | iteration 8160/ 143000 | elapsed time per iteration (ms): 62164.6 | learning rate: 5.967E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.592131E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 21:32:12,284] [INFO] [logging.py:60:log_dist] [Rank 0] step=8170, skipped=2, lr=[0.0005967191395199712, 0.0005967191395199712], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8170 loss: 2.6204 iter time (s): 61.809 samples/sec: 16.567 %comms: 0.0028829807368816523 %optimizer_step 0.05638553320513103 %forward: 23.494288072621046 %backward: 63.10524692565303 [2025-03-30 21:32:12,285] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13283.35 | forward: 145215.76 | backward_microstep: 390053.19 | backward: 390046.98 | backward_inner_microstep: 390032.22 | backward_inner: 390025.14 | backward_allreduce_microstep: 7.07 | backward_allreduce: 2.43 | reduce_tied_grads: 0.23 | comms: 17.82 | reduce_grads: 0.18 | step: 348.51 | _step_clipping: 0.11 | _step_step: 346.69 | _step_zero_grad: 0.49 | _step_check_overflow: 0.50 samples/sec: 16.567 | iteration 8170/ 143000 | elapsed time per iteration (ms): 61809.5 | learning rate: 5.967E-04 | approx flops per GPU: 71.5TFLOPS | lm_loss: 2.601393E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 21:42:33,975] [INFO] [logging.py:60:log_dist] [Rank 0] step=8180, skipped=2, lr=[0.0005967094117678802, 0.0005967094117678802], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8180 loss: 2.5820 iter time (s): 62.169 samples/sec: 16.471 %comms: 0.0028834085149652836 %optimizer_step 0.055970515885824856 %forward: 23.36080958385808 %backward: 62.75074573708783 [2025-03-30 21:42:33,975] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16798.95 | forward: 145230.82 | backward_microstep: 390118.48 | backward: 390112.43 | backward_inner_microstep: 390094.47 | backward_inner: 390087.37 | backward_allreduce_microstep: 8.77 | backward_allreduce: 4.14 | reduce_tied_grads: 0.25 | comms: 17.93 | reduce_grads: 0.18 | step: 347.96 | _step_clipping: 0.11 | _step_step: 346.31 | _step_zero_grad: 0.47 | _step_check_overflow: 0.53 samples/sec: 16.471 | iteration 8180/ 143000 | elapsed time per iteration (ms): 62169.1 | learning rate: 5.967E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.591770E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 21:53:00,612] [INFO] [logging.py:60:log_dist] [Rank 0] step=8190, skipped=2, lr=[0.0005966996696952625, 0.0005966996696952625], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8190 loss: 2.5918 iter time (s): 62.663 samples/sec: 16.341 %comms: 0.002866205067660811 %optimizer_step 0.056449195794743584 %forward: 23.17513942147034 %backward: 62.24176773880122 [2025-03-30 21:53:00,613] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21832.80 | forward: 145222.76 | backward_microstep: 390032.57 | backward: 390026.62 | backward_inner_microstep: 390011.37 | backward_inner: 390005.85 | backward_allreduce_microstep: 7.65 | backward_allreduce: 2.51 | reduce_tied_grads: 0.29 | comms: 17.96 | reduce_grads: 0.19 | step: 353.73 | _step_clipping: 0.15 | _step_step: 351.94 | _step_zero_grad: 0.48 | _step_check_overflow: 0.57 samples/sec: 16.341 | iteration 8190/ 143000 | elapsed time per iteration (ms): 62663.8 | learning rate: 5.967E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.592146E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 22:03:27,887] [INFO] [logging.py:60:log_dist] [Rank 0] step=8200, skipped=2, lr=[0.0005966899133025884, 0.0005966899133025884], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8200 loss: 2.6004 iter time (s): 62.726 samples/sec: 16.325 %comms: 0.0028781801707726392 %optimizer_step 0.0566739191285482 %forward: 23.164011964923713 %backward: 62.17564005949415 [2025-03-30 22:03:27,887] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22394.64 | forward: 145299.36 | backward_microstep: 390011.56 | backward: 390005.02 | backward_inner_microstep: 389990.08 | backward_inner: 389984.55 | backward_allreduce_microstep: 7.14 | backward_allreduce: 2.45 | reduce_tied_grads: 0.27 | comms: 18.05 | reduce_grads: 0.19 | step: 355.49 | _step_clipping: 0.11 | _step_step: 353.84 | _step_zero_grad: 0.47 | _step_check_overflow: 0.52 samples/sec: 16.325 | iteration 8200/ 143000 | elapsed time per iteration (ms): 62727.4 | learning rate: 5.967E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.592833E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 22:13:56,812] [INFO] [logging.py:60:log_dist] [Rank 0] step=8210, skipped=2, lr=[0.0005966801425903285, 0.0005966801425903285], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8210 loss: 2.5884 iter time (s): 62.892 samples/sec: 16.282 %comms: 0.0028628667373981277 %optimizer_step 0.05531391834556428 %forward: 23.093533178675248 %backward: 62.022169040878886 [2025-03-30 22:13:56,812] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24073.60 | forward: 145239.78 | backward_microstep: 390076.36 | backward: 390069.63 | backward_inner_microstep: 390054.53 | backward_inner: 390048.84 | backward_allreduce_microstep: 7.26 | backward_allreduce: 2.49 | reduce_tied_grads: 0.29 | comms: 18.01 | reduce_grads: 0.19 | step: 347.88 | _step_clipping: 0.12 | _step_step: 346.29 | _step_zero_grad: 0.47 | _step_check_overflow: 0.45 samples/sec: 16.282 | iteration 8210/ 143000 | elapsed time per iteration (ms): 62892.5 | learning rate: 5.967E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.591600E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 22:24:18,195] [INFO] [logging.py:60:log_dist] [Rank 0] step=8220, skipped=2, lr=[0.0005966703575589547, 0.0005966703575589547], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8220 loss: 2.6095 iter time (s): 62.138 samples/sec: 16.479 %comms: 0.0028708692555155886 %optimizer_step 0.05557151386730378 %forward: 23.37147303590133 %backward: 62.77073355689049 [2025-03-30 22:24:18,195] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16576.90 | forward: 145225.23 | backward_microstep: 390051.73 | backward: 390043.64 | backward_inner_microstep: 390028.96 | backward_inner: 390021.54 | backward_allreduce_microstep: 7.07 | backward_allreduce: 2.43 | reduce_tied_grads: 0.27 | comms: 17.84 | reduce_grads: 0.19 | step: 345.31 | _step_clipping: 0.14 | _step_step: 343.67 | _step_zero_grad: 0.45 | _step_check_overflow: 0.51 samples/sec: 16.479 | iteration 8220/ 143000 | elapsed time per iteration (ms): 62138.4 | learning rate: 5.967E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.600489E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 22:34:43,201] [INFO] [logging.py:60:log_dist] [Rank 0] step=8230, skipped=2, lr=[0.0005966605582089391, 0.0005966605582089391], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8230 loss: 2.5891 iter time (s): 62.500 samples/sec: 16.384 %comms: 0.0028479730407327355 %optimizer_step 0.055882649448239806 %forward: 23.22591609965416 %backward: 62.40164573855598 [2025-03-30 22:34:43,202] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20292.69 | forward: 145162.16 | backward_microstep: 390016.65 | backward: 390010.78 | backward_inner_microstep: 389994.36 | backward_inner: 389988.97 | backward_allreduce_microstep: 8.86 | backward_allreduce: 4.21 | reduce_tied_grads: 0.24 | comms: 17.80 | reduce_grads: 0.17 | step: 349.27 | _step_clipping: 0.11 | _step_step: 347.66 | _step_zero_grad: 0.45 | _step_check_overflow: 0.53 samples/sec: 16.384 | iteration 8230/ 143000 | elapsed time per iteration (ms): 62500.6 | learning rate: 5.967E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.592727E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 22:45:09,310] [INFO] [logging.py:60:log_dist] [Rank 0] step=8240, skipped=2, lr=[0.0005966507445407547, 0.0005966507445407547], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8240 loss: 2.6030 iter time (s): 62.610 samples/sec: 16.355 %comms: 0.002863671137794759 %optimizer_step 0.05582433707363087 %forward: 23.19939563919373 %backward: 62.299537984311335 [2025-03-30 22:45:09,311] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21211.31 | forward: 145252.30 | backward_microstep: 390066.87 | backward: 390059.78 | backward_inner_microstep: 390044.38 | backward_inner: 390038.54 | backward_allreduce_microstep: 7.29 | backward_allreduce: 2.52 | reduce_tied_grads: 0.28 | comms: 17.93 | reduce_grads: 0.18 | step: 349.52 | _step_clipping: 0.12 | _step_step: 347.89 | _step_zero_grad: 0.47 | _step_check_overflow: 0.50 samples/sec: 16.355 | iteration 8240/ 143000 | elapsed time per iteration (ms): 62610.9 | learning rate: 5.967E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.596607E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 22:55:27,746] [INFO] [logging.py:60:log_dist] [Rank 0] step=8250, skipped=2, lr=[0.0005966409165548751, 0.0005966409165548751], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8250 loss: 2.5900 iter time (s): 61.843 samples/sec: 16.558 %comms: 0.0028998631556354116 %optimizer_step 0.05703152876302707 %forward: 23.486892741132745 %backward: 63.08286407061184 [2025-03-30 22:55:27,747] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13447.50 | forward: 145249.86 | backward_microstep: 390130.29 | backward: 390122.99 | backward_inner_microstep: 390106.28 | backward_inner: 390100.53 | backward_allreduce_microstep: 8.85 | backward_allreduce: 2.47 | reduce_tied_grads: 0.27 | comms: 17.93 | reduce_grads: 0.18 | step: 352.70 | _step_clipping: 0.11 | _step_step: 351.00 | _step_zero_grad: 0.47 | _step_check_overflow: 0.57 samples/sec: 16.558 | iteration 8250/ 143000 | elapsed time per iteration (ms): 61843.6 | learning rate: 5.966E-04 | approx flops per GPU: 71.4TFLOPS | lm_loss: 2.596357E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 23:05:53,496] [INFO] [logging.py:60:log_dist] [Rank 0] step=8260, skipped=2, lr=[0.0005966310742517747, 0.0005966310742517747], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8260 loss: 2.5853 iter time (s): 62.574 samples/sec: 16.365 %comms: 0.0028635633228306433 %optimizer_step 0.05625204884410395 %forward: 23.205646542081297 %backward: 62.33570996502902 [2025-03-30 23:05:53,497] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20888.93 | forward: 145208.03 | backward_microstep: 390068.67 | backward: 390062.21 | backward_inner_microstep: 390047.27 | backward_inner: 390041.49 | backward_allreduce_microstep: 7.21 | backward_allreduce: 2.47 | reduce_tied_grads: 0.28 | comms: 17.92 | reduce_grads: 0.18 | step: 351.99 | _step_clipping: 0.13 | _step_step: 350.23 | _step_zero_grad: 0.46 | _step_check_overflow: 0.62 samples/sec: 16.364 | iteration 8260/ 143000 | elapsed time per iteration (ms): 62575.0 | learning rate: 5.966E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.603335E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 23:16:20,445] [INFO] [logging.py:60:log_dist] [Rank 0] step=8270, skipped=2, lr=[0.0005966212176319286, 0.0005966212176319286], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8270 loss: 2.5866 iter time (s): 62.694 samples/sec: 16.333 %comms: 0.002889160780832663 %optimizer_step 0.05590120914221329 %forward: 23.166707598821194 %backward: 62.23113756434543 [2025-03-30 23:16:20,445] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21963.31 | forward: 145241.92 | backward_microstep: 390161.49 | backward: 390153.40 | backward_inner_microstep: 390138.05 | backward_inner: 390132.13 | backward_allreduce_microstep: 7.29 | backward_allreduce: 2.52 | reduce_tied_grads: 0.30 | comms: 18.11 | reduce_grads: 0.19 | step: 350.47 | _step_clipping: 0.14 | _step_step: 348.74 | _step_zero_grad: 0.46 | _step_check_overflow: 0.57 samples/sec: 16.333 | iteration 8270/ 143000 | elapsed time per iteration (ms): 62694.8 | learning rate: 5.966E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.598638E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 23:26:41,941] [INFO] [logging.py:60:log_dist] [Rank 0] step=8280, skipped=2, lr=[0.0005966113466958124, 0.0005966113466958124], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8280 loss: 2.5850 iter time (s): 62.149 samples/sec: 16.477 %comms: 0.002890680352326151 %optimizer_step 0.05634072269777036 %forward: 23.363724923766092 %backward: 62.78232922681279 [2025-03-30 23:26:41,942] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16483.88 | forward: 145203.44 | backward_microstep: 390195.21 | backward: 390186.49 | backward_inner_microstep: 390171.22 | backward_inner: 390165.36 | backward_allreduce_microstep: 7.22 | backward_allreduce: 2.49 | reduce_tied_grads: 0.29 | comms: 17.97 | reduce_grads: 0.18 | step: 350.15 | _step_clipping: 0.12 | _step_step: 348.47 | _step_zero_grad: 0.46 | _step_check_overflow: 0.55 samples/sec: 16.476 | iteration 8280/ 143000 | elapsed time per iteration (ms): 62149.6 | learning rate: 5.966E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.590123E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 23:37:09,201] [INFO] [logging.py:60:log_dist] [Rank 0] step=8290, skipped=2, lr=[0.0005966014614439025, 0.0005966014614439025], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8290 loss: 2.5953 iter time (s): 62.725 samples/sec: 16.325 %comms: 0.002898937250926554 %optimizer_step 0.05581062366055524 %forward: 23.159651432496496 %backward: 62.20453122380324 [2025-03-30 23:37:09,201] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22195.94 | forward: 145269.91 | backward_microstep: 390188.60 | backward: 390180.59 | backward_inner_microstep: 390163.66 | backward_inner: 390157.92 | backward_allreduce_microstep: 8.99 | backward_allreduce: 2.47 | reduce_tied_grads: 0.26 | comms: 18.18 | reduce_grads: 0.18 | step: 350.07 | _step_clipping: 0.12 | _step_step: 348.48 | _step_zero_grad: 0.46 | _step_check_overflow: 0.46 samples/sec: 16.325 | iteration 8290/ 143000 | elapsed time per iteration (ms): 62726.0 | learning rate: 5.966E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.584081E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 23:47:32,102] [INFO] [logging.py:60:log_dist] [Rank 0] step=8300, skipped=2, lr=[0.0005965915618766762, 0.0005965915618766762], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8300 loss: 2.6097 iter time (s): 62.290 samples/sec: 16.439 %comms: 0.0028811349950847143 %optimizer_step 0.055750668343571216 %forward: 23.322930592255343 %backward: 62.635496666421396 [2025-03-30 23:47:32,103] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17851.12 | forward: 145277.66 | backward_microstep: 390161.49 | backward: 390154.16 | backward_inner_microstep: 390138.76 | backward_inner: 390132.94 | backward_allreduce_microstep: 7.41 | backward_allreduce: 2.57 | reduce_tied_grads: 0.28 | comms: 17.95 | reduce_grads: 0.18 | step: 347.27 | _step_clipping: 0.12 | _step_step: 345.58 | _step_zero_grad: 0.46 | _step_check_overflow: 0.56 samples/sec: 16.439 | iteration 8300/ 143000 | elapsed time per iteration (ms): 62290.2 | learning rate: 5.966E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.597025E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-30 23:57:53,589] [INFO] [logging.py:60:log_dist] [Rank 0] step=8310, skipped=2, lr=[0.0005965816479946111, 0.0005965816479946111], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8310 loss: 2.5886 iter time (s): 62.148 samples/sec: 16.477 %comms: 0.0029025779319885874 %optimizer_step 0.056126775163718515 %forward: 23.37233289998947 %backward: 62.76909594535592 [2025-03-30 23:57:53,590] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16534.90 | forward: 145254.75 | backward_microstep: 390105.72 | backward: 390098.39 | backward_inner_microstep: 390083.23 | backward_inner: 390077.48 | backward_allreduce_microstep: 7.30 | backward_allreduce: 2.51 | reduce_tied_grads: 0.26 | comms: 18.04 | reduce_grads: 0.18 | step: 348.82 | _step_clipping: 0.12 | _step_step: 347.11 | _step_zero_grad: 0.47 | _step_check_overflow: 0.57 samples/sec: 16.477 | iteration 8310/ 143000 | elapsed time per iteration (ms): 62148.7 | learning rate: 5.966E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.593185E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 00:08:20,650] [INFO] [logging.py:60:log_dist] [Rank 0] step=8320, skipped=2, lr=[0.0005965717197981857, 0.0005965717197981857], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8320 loss: 2.5956 iter time (s): 62.706 samples/sec: 16.330 %comms: 0.002884153692331731 %optimizer_step 0.05576456317485722 %forward: 23.174328602536757 %backward: 62.21549055735761 [2025-03-31 00:08:20,651] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21999.54 | forward: 145315.88 | backward_microstep: 390133.07 | backward: 390125.59 | backward_inner_microstep: 390108.74 | backward_inner: 390102.98 | backward_allreduce_microstep: 7.17 | backward_allreduce: 2.47 | reduce_tied_grads: 0.30 | comms: 18.09 | reduce_grads: 0.18 | step: 349.67 | _step_clipping: 0.12 | _step_step: 348.06 | _step_zero_grad: 0.45 | _step_check_overflow: 0.50 samples/sec: 16.330 | iteration 8320/ 143000 | elapsed time per iteration (ms): 62706.1 | learning rate: 5.966E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.589927E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 00:18:42,901] [INFO] [logging.py:60:log_dist] [Rank 0] step=8330, skipped=2, lr=[0.0005965617772878794, 0.0005965617772878794], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8330 loss: 2.5633 iter time (s): 62.225 samples/sec: 16.457 %comms: 0.002888130253452157 %optimizer_step 0.061521768463264064 %forward: 23.346760908393577 %backward: 62.707665811784274 [2025-03-31 00:18:42,902] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17118.54 | forward: 145274.30 | backward_microstep: 390204.06 | backward: 390195.99 | backward_inner_microstep: 390180.72 | backward_inner: 390174.81 | backward_allreduce_microstep: 7.26 | backward_allreduce: 2.51 | reduce_tied_grads: 0.32 | comms: 17.97 | reduce_grads: 0.19 | step: 382.82 | _step_clipping: 0.14 | _step_step: 381.03 | _step_zero_grad: 0.47 | _step_check_overflow: 0.61 samples/sec: 16.456 | iteration 8330/ 143000 | elapsed time per iteration (ms): 62225.2 | learning rate: 5.966E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.579136E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 00:29:11,693] [INFO] [logging.py:60:log_dist] [Rank 0] step=8340, skipped=2, lr=[0.0005965518204641717, 0.0005965518204641717], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8340 loss: 2.5751 iter time (s): 62.879 samples/sec: 16.285 %comms: 0.002852859346844323 %optimizer_step 0.05600380656879701 %forward: 23.10610536455465 %backward: 62.045295838135836 [2025-03-31 00:29:11,693] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23738.34 | forward: 145287.92 | backward_microstep: 390141.89 | backward: 390132.03 | backward_inner_microstep: 390114.54 | backward_inner: 390108.56 | backward_allreduce_microstep: 7.31 | backward_allreduce: 2.52 | reduce_tied_grads: 0.29 | comms: 17.94 | reduce_grads: 0.19 | step: 352.14 | _step_clipping: 0.16 | _step_step: 350.47 | _step_zero_grad: 0.46 | _step_check_overflow: 0.51 samples/sec: 16.285 | iteration 8340/ 143000 | elapsed time per iteration (ms): 62879.1 | learning rate: 5.966E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.576691E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 00:39:38,385] [INFO] [logging.py:60:log_dist] [Rank 0] step=8350, skipped=2, lr=[0.0005965418493275434, 0.0005965418493275434], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8350 loss: 2.5742 iter time (s): 62.669 samples/sec: 16.340 %comms: 0.002851231427717339 %optimizer_step 0.05535398840910225 %forward: 23.182422093002117 %backward: 62.256578684850105 [2025-03-31 00:39:38,385] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21616.66 | forward: 145281.09 | backward_microstep: 390161.05 | backward: 390153.53 | backward_inner_microstep: 390136.46 | backward_inner: 390128.81 | backward_allreduce_microstep: 9.12 | backward_allreduce: 4.35 | reduce_tied_grads: 0.28 | comms: 17.87 | reduce_grads: 0.18 | step: 346.90 | _step_clipping: 0.14 | _step_step: 345.30 | _step_zero_grad: 0.46 | _step_check_overflow: 0.47 samples/sec: 16.340 | iteration 8350/ 143000 | elapsed time per iteration (ms): 62669.2 | learning rate: 5.965E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.582061E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 00:50:06,158] [INFO] [logging.py:60:log_dist] [Rank 0] step=8360, skipped=2, lr=[0.0005965318638784758, 0.0005965318638784758], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8360 loss: 2.6139 iter time (s): 62.777 samples/sec: 16.312 %comms: 0.0028635973350040022 %optimizer_step 0.05659391217459163 %forward: 23.149550133852877 %backward: 62.15942927842764 [2025-03-31 00:50:06,159] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22599.47 | forward: 145325.57 | backward_microstep: 390227.08 | backward: 390217.29 | backward_inner_microstep: 390202.00 | backward_inner: 390196.06 | backward_allreduce_microstep: 7.28 | backward_allreduce: 2.48 | reduce_tied_grads: 2.03 | comms: 17.98 | reduce_grads: 0.18 | step: 355.28 | _step_clipping: 0.13 | _step_step: 353.57 | _step_zero_grad: 0.47 | _step_check_overflow: 0.56 samples/sec: 16.312 | iteration 8360/ 143000 | elapsed time per iteration (ms): 62777.4 | learning rate: 5.965E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.596607E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 01:00:32,398] [INFO] [logging.py:60:log_dist] [Rank 0] step=8370, skipped=2, lr=[0.0005965218641174507, 0.0005965218641174507], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8370 loss: 2.5647 iter time (s): 62.623 samples/sec: 16.352 %comms: 0.002846665825719198 %optimizer_step 0.05555934367753036 %forward: 23.19216659088775 %backward: 62.28373745499914 [2025-03-31 01:00:32,399] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21341.55 | forward: 145237.28 | backward_microstep: 390048.50 | backward: 390042.08 | backward_inner_microstep: 390027.30 | backward_inner: 390021.59 | backward_allreduce_microstep: 7.13 | backward_allreduce: 2.45 | reduce_tied_grads: 0.25 | comms: 17.83 | reduce_grads: 0.18 | step: 347.93 | _step_clipping: 0.12 | _step_step: 346.36 | _step_zero_grad: 0.46 | _step_check_overflow: 0.47 samples/sec: 16.352 | iteration 8370/ 143000 | elapsed time per iteration (ms): 62624.0 | learning rate: 5.965E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.580322E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 01:10:54,758] [INFO] [logging.py:60:log_dist] [Rank 0] step=8380, skipped=2, lr=[0.0005965118500449509, 0.0005965118500449509], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8380 loss: 2.5827 iter time (s): 62.235 samples/sec: 16.454 %comms: 0.002897091362450477 %optimizer_step 0.05723827965999807 %forward: 23.331939205803494 %backward: 62.67714495546102 [2025-03-31 01:10:54,758] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17442.55 | forward: 145207.28 | backward_microstep: 390080.70 | backward: 390073.78 | backward_inner_microstep: 390058.75 | backward_inner: 390053.13 | backward_allreduce_microstep: 7.23 | backward_allreduce: 2.49 | reduce_tied_grads: 0.32 | comms: 18.03 | reduce_grads: 0.22 | step: 356.22 | _step_clipping: 0.14 | _step_step: 354.44 | _step_zero_grad: 0.50 | _step_check_overflow: 0.57 samples/sec: 16.454 | iteration 8380/ 143000 | elapsed time per iteration (ms): 62236.0 | learning rate: 5.965E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.580298E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 01:21:20,705] [INFO] [logging.py:60:log_dist] [Rank 0] step=8390, skipped=2, lr=[0.0005965018216614595, 0.0005965018216614595], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8390 loss: 2.5754 iter time (s): 62.594 samples/sec: 16.359 %comms: 0.0028605270429204526 %optimizer_step 0.05555059720287679 %forward: 23.20978092492904 %backward: 62.31897011296854 [2025-03-31 01:21:20,706] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20963.27 | forward: 145279.73 | backward_microstep: 390087.90 | backward: 390080.50 | backward_inner_microstep: 390063.23 | backward_inner: 390055.13 | backward_allreduce_microstep: 7.24 | backward_allreduce: 2.49 | reduce_tied_grads: 0.29 | comms: 17.91 | reduce_grads: 0.18 | step: 347.71 | _step_clipping: 0.12 | _step_step: 346.04 | _step_zero_grad: 0.46 | _step_check_overflow: 0.56 samples/sec: 16.359 | iteration 8390/ 143000 | elapsed time per iteration (ms): 62594.7 | learning rate: 5.965E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.581155E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 01:31:38,947] [INFO] [logging.py:60:log_dist] [Rank 0] step=8400, skipped=2, lr=[0.0005964917789674606, 0.0005964917789674606], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8400 loss: 2.5630 iter time (s): 61.824 samples/sec: 16.563 %comms: 0.0028927047681033087 %optimizer_step 0.05682874894993005 %forward: 23.485387149193816 %backward: 63.096551656201015 [2025-03-31 01:31:38,948] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13319.33 | forward: 145195.41 | backward_microstep: 390095.82 | backward: 390086.37 | backward_inner_microstep: 390071.28 | backward_inner: 390065.54 | backward_allreduce_microstep: 7.13 | backward_allreduce: 2.45 | reduce_tied_grads: 0.26 | comms: 17.88 | reduce_grads: 0.18 | step: 351.34 | _step_clipping: 0.12 | _step_step: 349.65 | _step_zero_grad: 0.46 | _step_check_overflow: 0.55 samples/sec: 16.563 | iteration 8400/ 143000 | elapsed time per iteration (ms): 61824.3 | learning rate: 5.965E-04 | approx flops per GPU: 71.5TFLOPS | lm_loss: 2.577553E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 01:42:05,737] [INFO] [logging.py:60:log_dist] [Rank 0] step=8410, skipped=2, lr=[0.0005964817219634391, 0.0005964817219634391], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8410 loss: 2.5773 iter time (s): 62.678 samples/sec: 16.337 %comms: 0.0028588510665593088 %optimizer_step 0.056254210057788766 %forward: 23.201464516554083 %backward: 62.24654421128692 [2025-03-31 01:42:05,738] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21575.04 | forward: 145423.10 | backward_microstep: 390159.60 | backward: 390151.47 | backward_inner_microstep: 390136.01 | backward_inner: 390130.09 | backward_allreduce_microstep: 7.43 | backward_allreduce: 2.48 | reduce_tied_grads: 0.30 | comms: 17.92 | reduce_grads: 0.18 | step: 352.59 | _step_clipping: 0.12 | _step_step: 350.99 | _step_zero_grad: 0.45 | _step_check_overflow: 0.48 samples/sec: 16.337 | iteration 8410/ 143000 | elapsed time per iteration (ms): 62679.0 | learning rate: 5.965E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.588825E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 01:52:24,064] [INFO] [logging.py:60:log_dist] [Rank 0] step=8420, skipped=2, lr=[0.0005964716506498801, 0.0005964716506498801], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8420 loss: 2.5783 iter time (s): 61.832 samples/sec: 16.561 %comms: 0.00291367107815133 %optimizer_step 0.0570669495080232 %forward: 23.489996836887 %backward: 63.09115571466245 [2025-03-31 01:52:24,065] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13348.16 | forward: 145243.76 | backward_microstep: 390113.71 | backward: 390106.33 | backward_inner_microstep: 390091.50 | backward_inner: 390085.85 | backward_allreduce_microstep: 7.06 | backward_allreduce: 2.45 | reduce_tied_grads: 0.33 | comms: 18.02 | reduce_grads: 0.19 | step: 352.86 | _step_clipping: 0.14 | _step_step: 351.10 | _step_zero_grad: 0.49 | _step_check_overflow: 0.56 samples/sec: 16.561 | iteration 8420/ 143000 | elapsed time per iteration (ms): 61832.7 | learning rate: 5.965E-04 | approx flops per GPU: 71.4TFLOPS | lm_loss: 2.582859E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 02:02:42,454] [INFO] [logging.py:60:log_dist] [Rank 0] step=8430, skipped=2, lr=[0.0005964615650272698, 0.0005964615650272698], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8430 loss: 2.5820 iter time (s): 61.838 samples/sec: 16.559 %comms: 0.0029116771390135564 %optimizer_step 0.05745677425114366 %forward: 23.483437318434525 %backward: 63.09005503976993 [2025-03-31 02:02:42,455] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13391.21 | forward: 145218.02 | backward_microstep: 390147.26 | backward: 390139.36 | backward_inner_microstep: 390124.22 | backward_inner: 390118.39 | backward_allreduce_microstep: 7.20 | backward_allreduce: 2.47 | reduce_tied_grads: 0.29 | comms: 18.01 | reduce_grads: 0.19 | step: 355.30 | _step_clipping: 0.11 | _step_step: 353.58 | _step_zero_grad: 0.47 | _step_check_overflow: 0.57 samples/sec: 16.559 | iteration 8430/ 143000 | elapsed time per iteration (ms): 61839.0 | learning rate: 5.965E-04 | approx flops per GPU: 71.4TFLOPS | lm_loss: 2.580498E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 02:13:04,660] [INFO] [logging.py:60:log_dist] [Rank 0] step=8440, skipped=2, lr=[0.0005964514650960951, 0.0005964514650960951], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8440 loss: 2.6016 iter time (s): 62.220 samples/sec: 16.458 %comms: 0.002875238334879882 %optimizer_step 0.056019038491316606 %forward: 23.340757128335444 %backward: 62.68924675216914 [2025-03-31 02:13:04,660] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17317.15 | forward: 145226.23 | backward_microstep: 390059.72 | backward: 390052.61 | backward_inner_microstep: 390037.60 | backward_inner: 390031.93 | backward_allreduce_microstep: 7.17 | backward_allreduce: 2.47 | reduce_tied_grads: 0.26 | comms: 17.89 | reduce_grads: 0.19 | step: 348.55 | _step_clipping: 0.11 | _step_step: 346.96 | _step_zero_grad: 0.45 | _step_check_overflow: 0.50 samples/sec: 16.458 | iteration 8440/ 143000 | elapsed time per iteration (ms): 62220.5 | learning rate: 5.965E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.588327E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 02:23:26,892] [INFO] [logging.py:60:log_dist] [Rank 0] step=8450, skipped=2, lr=[0.0005964413508568433, 0.0005964413508568433], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8450 loss: 2.5714 iter time (s): 62.223 samples/sec: 16.457 %comms: 0.0028733933396618387 %optimizer_step 0.05580019512605312 %forward: 23.330150412550584 %backward: 62.68020269455356 [2025-03-31 02:23:26,892] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17447.64 | forward: 145166.33 | backward_microstep: 390018.84 | backward: 390012.71 | backward_inner_microstep: 389996.19 | backward_inner: 389987.09 | backward_allreduce_microstep: 7.13 | backward_allreduce: 2.45 | reduce_tied_grads: 0.28 | comms: 17.88 | reduce_grads: 0.18 | step: 347.20 | _step_clipping: 0.12 | _step_step: 345.35 | _step_zero_grad: 0.46 | _step_check_overflow: 0.73 samples/sec: 16.457 | iteration 8450/ 143000 | elapsed time per iteration (ms): 62223.2 | learning rate: 5.964E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.577014E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 02:33:52,966] [INFO] [logging.py:60:log_dist] [Rank 0] step=8460, skipped=2, lr=[0.0005964312223100027, 0.0005964312223100027], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8460 loss: 2.5900 iter time (s): 62.607 samples/sec: 16.356 %comms: 0.0028582717698212357 %optimizer_step 0.056116584923652166 %forward: 23.190274324198274 %backward: 62.30548411165625 [2025-03-31 02:33:52,966] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21197.97 | forward: 145187.05 | backward_microstep: 390081.65 | backward: 390075.13 | backward_inner_microstep: 390058.62 | backward_inner: 390053.05 | backward_allreduce_microstep: 7.12 | backward_allreduce: 2.45 | reduce_tied_grads: 0.28 | comms: 17.89 | reduce_grads: 0.17 | step: 351.33 | _step_clipping: 0.12 | _step_step: 349.65 | _step_zero_grad: 0.46 | _step_check_overflow: 0.57 samples/sec: 16.356 | iteration 8460/ 143000 | elapsed time per iteration (ms): 62607.4 | learning rate: 5.964E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.575890E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 02:44:14,584] [INFO] [logging.py:60:log_dist] [Rank 0] step=8470, skipped=2, lr=[0.0005964210794560619, 0.0005964210794560619], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8470 loss: 2.5741 iter time (s): 62.161 samples/sec: 16.473 %comms: 0.004166558552478155 %optimizer_step 0.05834347959366716 %forward: 23.3673284870252 %backward: 62.75778556389493 [2025-03-31 02:44:14,585] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16598.96 | forward: 145254.45 | backward_microstep: 390118.62 | backward: 390110.82 | backward_inner_microstep: 390095.63 | backward_inner: 390089.77 | backward_allreduce_microstep: 7.21 | backward_allreduce: 2.47 | reduce_tied_grads: 0.31 | comms: 25.90 | reduce_grads: 0.18 | step: 362.67 | _step_clipping: 0.14 | _step_step: 360.90 | _step_zero_grad: 0.48 | _step_check_overflow: 0.58 samples/sec: 16.473 | iteration 8470/ 143000 | elapsed time per iteration (ms): 62161.9 | learning rate: 5.964E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.580057E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 02:54:32,967] [INFO] [logging.py:60:log_dist] [Rank 0] step=8480, skipped=2, lr=[0.0005964109222955108, 0.0005964109222955108], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8480 loss: 2.5742 iter time (s): 61.838 samples/sec: 16.559 %comms: 0.002896716118038134 %optimizer_step 0.05786985750139225 %forward: 23.49100358380279 %backward: 63.089439637012255 [2025-03-31 02:54:32,967] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13322.60 | forward: 145262.96 | backward_microstep: 390141.90 | backward: 390130.58 | backward_inner_microstep: 390115.36 | backward_inner: 390109.53 | backward_allreduce_microstep: 7.25 | backward_allreduce: 2.49 | reduce_tied_grads: 0.27 | comms: 17.91 | reduce_grads: 0.18 | step: 357.85 | _step_clipping: 0.10 | _step_step: 356.19 | _step_zero_grad: 0.46 | _step_check_overflow: 0.55 samples/sec: 16.559 | iteration 8480/ 143000 | elapsed time per iteration (ms): 61838.2 | learning rate: 5.964E-04 | approx flops per GPU: 71.4TFLOPS | lm_loss: 2.572635E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 03:04:51,400] [INFO] [logging.py:60:log_dist] [Rank 0] step=8490, skipped=2, lr=[0.0005964007508288394, 0.0005964007508288394], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8490 loss: 2.5957 iter time (s): 61.843 samples/sec: 16.558 %comms: 0.002901374141420347 %optimizer_step 0.056941115639123616 %forward: 23.49524745408621 %backward: 63.08964797887153 [2025-03-31 03:04:51,400] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13303.01 | forward: 145301.15 | backward_microstep: 390171.90 | backward: 390163.95 | backward_inner_microstep: 390148.55 | backward_inner: 390142.71 | backward_allreduce_microstep: 7.28 | backward_allreduce: 2.52 | reduce_tied_grads: 0.29 | comms: 17.94 | reduce_grads: 0.18 | step: 352.14 | _step_clipping: 0.11 | _step_step: 350.45 | _step_zero_grad: 0.49 | _step_check_overflow: 0.53 samples/sec: 16.558 | iteration 8490/ 143000 | elapsed time per iteration (ms): 61843.3 | learning rate: 5.964E-04 | approx flops per GPU: 71.4TFLOPS | lm_loss: 2.579941E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 03:15:19,234] [INFO] [logging.py:60:log_dist] [Rank 0] step=8500, skipped=2, lr=[0.0005963905650565386, 0.0005963905650565386], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8500 loss: 2.5873 iter time (s): 62.783 samples/sec: 16.310 %comms: 0.0028561445853050624 %optimizer_step 0.05565234427392572 %forward: 23.143089698626728 %backward: 62.13938287554214 [2025-03-31 03:15:19,235] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22767.84 | forward: 145298.99 | backward_microstep: 390136.15 | backward: 390128.97 | backward_inner_microstep: 390113.85 | backward_inner: 390108.09 | backward_allreduce_microstep: 7.14 | backward_allreduce: 2.45 | reduce_tied_grads: 0.28 | comms: 17.93 | reduce_grads: 0.18 | step: 349.40 | _step_clipping: 0.12 | _step_step: 347.77 | _step_zero_grad: 0.46 | _step_check_overflow: 0.50 samples/sec: 16.310 | iteration 8500/ 143000 | elapsed time per iteration (ms): 62783.4 | learning rate: 5.964E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.586093E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 03:25:45,797] [INFO] [logging.py:60:log_dist] [Rank 0] step=8510, skipped=2, lr=[0.0005963803649791001, 0.0005963803649791001], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8510 loss: 2.5692 iter time (s): 62.656 samples/sec: 16.343 %comms: 0.0028454216101025014 %optimizer_step 0.05564723352003233 %forward: 23.170681074362356 %backward: 62.24940009490687 [2025-03-31 03:25:45,798] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21746.52 | forward: 145177.83 | backward_microstep: 390034.95 | backward: 390028.80 | backward_inner_microstep: 390014.05 | backward_inner: 390008.62 | backward_allreduce_microstep: 7.21 | backward_allreduce: 2.48 | reduce_tied_grads: 0.24 | comms: 17.83 | reduce_grads: 0.17 | step: 348.66 | _step_clipping: 0.12 | _step_step: 347.00 | _step_zero_grad: 0.46 | _step_check_overflow: 0.56 samples/sec: 16.343 | iteration 8510/ 143000 | elapsed time per iteration (ms): 62656.3 | learning rate: 5.964E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.577501E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 03:36:11,721] [INFO] [logging.py:60:log_dist] [Rank 0] step=8520, skipped=2, lr=[0.0005963701505970162, 0.0005963701505970162], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8520 loss: 2.5611 iter time (s): 62.592 samples/sec: 16.360 %comms: 0.0028817372919738276 %optimizer_step 0.056236767671068645 %forward: 23.884547778126517 %backward: 62.32094540226755 [2025-03-31 03:36:11,721] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16696.98 | forward: 149497.74 | backward_microstep: 390085.29 | backward: 390078.16 | backward_inner_microstep: 390063.15 | backward_inner: 390055.68 | backward_allreduce_microstep: 7.21 | backward_allreduce: 2.48 | reduce_tied_grads: 0.28 | comms: 18.04 | reduce_grads: 0.19 | step: 352.00 | _step_clipping: 0.12 | _step_step: 350.30 | _step_zero_grad: 0.47 | _step_check_overflow: 0.56 samples/sec: 16.360 | iteration 8520/ 143000 | elapsed time per iteration (ms): 62592.4 | learning rate: 5.964E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.569954E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 03:46:34,457] [INFO] [logging.py:60:log_dist] [Rank 0] step=8530, skipped=2, lr=[0.0005963599219107799, 0.0005963599219107799], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8530 loss: 2.5867 iter time (s): 62.273 samples/sec: 16.444 %comms: 0.0028760417627129945 %optimizer_step 0.05640261704654479 %forward: 23.315199411444855 %backward: 62.63704401692597 [2025-03-31 03:46:34,458] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17851.24 | forward: 145190.97 | backward_microstep: 390069.10 | backward: 390060.29 | backward_inner_microstep: 390045.47 | backward_inner: 390038.12 | backward_allreduce_microstep: 7.10 | backward_allreduce: 2.42 | reduce_tied_grads: 0.27 | comms: 17.91 | reduce_grads: 0.18 | step: 351.24 | _step_clipping: 0.13 | _step_step: 349.58 | _step_zero_grad: 0.46 | _step_check_overflow: 0.53 samples/sec: 16.444 | iteration 8530/ 143000 | elapsed time per iteration (ms): 62273.6 | learning rate: 5.964E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.579513E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 03:56:52,496] [INFO] [logging.py:60:log_dist] [Rank 0] step=8540, skipped=2, lr=[0.0005963496789208847, 0.0005963496789208847], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8540 loss: 2.5939 iter time (s): 61.803 samples/sec: 16.569 %comms: 0.002892579116591887 %optimizer_step 0.056626184561192235 %forward: 23.49235577061392 %backward: 63.12536616232688 [2025-03-31 03:56:52,496] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13045.61 | forward: 145190.58 | backward_microstep: 390143.57 | backward: 390135.78 | backward_inner_microstep: 390120.66 | backward_inner: 390113.27 | backward_allreduce_microstep: 7.22 | backward_allreduce: 2.47 | reduce_tied_grads: 0.25 | comms: 17.88 | reduce_grads: 0.19 | step: 349.97 | _step_clipping: 0.11 | _step_step: 348.28 | _step_zero_grad: 0.48 | _step_check_overflow: 0.56 samples/sec: 16.569 | iteration 8540/ 143000 | elapsed time per iteration (ms): 61803.8 | learning rate: 5.963E-04 | approx flops per GPU: 71.5TFLOPS | lm_loss: 2.581955E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 04:07:20,494] [INFO] [logging.py:60:log_dist] [Rank 0] step=8550, skipped=2, lr=[0.0005963394216278253, 0.0005963394216278253], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8550 loss: 2.5973 iter time (s): 62.799 samples/sec: 16.306 %comms: 0.0028769630664430682 %optimizer_step 0.05578268856914973 %forward: 23.14045407532642 %backward: 62.12739752444555 [2025-03-31 04:07:20,495] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22851.76 | forward: 145320.39 | backward_microstep: 390163.35 | backward: 390155.60 | backward_inner_microstep: 390138.47 | backward_inner: 390132.58 | backward_allreduce_microstep: 7.35 | backward_allreduce: 2.51 | reduce_tied_grads: 0.28 | comms: 18.07 | reduce_grads: 0.18 | step: 350.31 | _step_clipping: 0.12 | _step_step: 348.61 | _step_zero_grad: 0.46 | _step_check_overflow: 0.57 samples/sec: 16.306 | iteration 8550/ 143000 | elapsed time per iteration (ms): 62799.8 | learning rate: 5.963E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.581627E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 04:17:38,708] [INFO] [logging.py:60:log_dist] [Rank 0] step=8560, skipped=2, lr=[0.0005963291500320965, 0.0005963291500320965], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8560 loss: 2.5798 iter time (s): 61.821 samples/sec: 16.564 %comms: 0.0029281645895160285 %optimizer_step 0.05818430338817671 %forward: 23.492957449270556 %backward: 63.110683691656654 [2025-03-31 04:17:38,709] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13163.52 | forward: 145235.51 | backward_microstep: 390163.06 | backward: 390155.75 | backward_inner_microstep: 390140.91 | backward_inner: 390135.23 | backward_allreduce_microstep: 7.08 | backward_allreduce: 2.43 | reduce_tied_grads: 0.28 | comms: 18.10 | reduce_grads: 0.18 | step: 359.70 | _step_clipping: 0.13 | _step_step: 358.01 | _step_zero_grad: 0.46 | _step_check_overflow: 0.54 samples/sec: 16.564 | iteration 8560/ 143000 | elapsed time per iteration (ms): 61821.4 | learning rate: 5.963E-04 | approx flops per GPU: 71.5TFLOPS | lm_loss: 2.577377E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 04:27:57,568] [INFO] [logging.py:60:log_dist] [Rank 0] step=8570, skipped=2, lr=[0.0005963188641341942, 0.0005963188641341942], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8570 loss: 2.5895 iter time (s): 61.885 samples/sec: 16.547 %comms: 0.00289767903221715 %optimizer_step 0.05660614101672322 %forward: 23.472312169462466 %backward: 63.04770500607597 [2025-03-31 04:27:57,568] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13747.47 | forward: 145259.44 | backward_microstep: 390182.85 | backward: 390173.51 | backward_inner_microstep: 390158.34 | backward_inner: 390152.45 | backward_allreduce_microstep: 7.16 | backward_allreduce: 2.47 | reduce_tied_grads: 0.26 | comms: 17.93 | reduce_grads: 0.18 | step: 350.31 | _step_clipping: 0.13 | _step_step: 348.67 | _step_zero_grad: 0.46 | _step_check_overflow: 0.50 samples/sec: 16.547 | iteration 8570/ 143000 | elapsed time per iteration (ms): 61886.0 | learning rate: 5.963E-04 | approx flops per GPU: 71.4TFLOPS | lm_loss: 2.575706E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 04:38:24,285] [INFO] [logging.py:60:log_dist] [Rank 0] step=8580, skipped=2, lr=[0.0005963085639346146, 0.0005963085639346146], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8580 loss: 2.5653 iter time (s): 62.671 samples/sec: 16.339 %comms: 0.002848298565023181 %optimizer_step 0.056154937961684526 %forward: 23.18046227344362 %backward: 62.25060123281595 [2025-03-31 04:38:24,286] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21643.33 | forward: 145274.81 | backward_microstep: 390139.08 | backward: 390132.17 | backward_inner_microstep: 390108.32 | backward_inner: 390100.96 | backward_allreduce_microstep: 7.21 | backward_allreduce: 2.47 | reduce_tied_grads: 0.26 | comms: 17.85 | reduce_grads: 0.17 | step: 351.93 | _step_clipping: 0.11 | _step_step: 350.34 | _step_zero_grad: 0.45 | _step_check_overflow: 0.50 samples/sec: 16.339 | iteration 8580/ 143000 | elapsed time per iteration (ms): 62671.8 | learning rate: 5.963E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.570233E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 04:48:52,226] [INFO] [logging.py:60:log_dist] [Rank 0] step=8590, skipped=2, lr=[0.0005962982494338553, 0.0005962982494338553], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8590 loss: 2.5941 iter time (s): 62.794 samples/sec: 16.307 %comms: 0.0028342468083041405 %optimizer_step 0.05519504581874327 %forward: 23.11988687710623 %backward: 62.10783434421501 [2025-03-31 04:48:52,226] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23158.33 | forward: 145177.89 | backward_microstep: 390002.65 | backward: 389996.92 | backward_inner_microstep: 389982.53 | backward_inner: 389977.26 | backward_allreduce_microstep: 7.01 | backward_allreduce: 2.42 | reduce_tied_grads: 0.23 | comms: 17.80 | reduce_grads: 0.17 | step: 346.59 | _step_clipping: 0.11 | _step_step: 345.02 | _step_zero_grad: 0.46 | _step_check_overflow: 0.48 samples/sec: 16.307 | iteration 8590/ 143000 | elapsed time per iteration (ms): 62794.0 | learning rate: 5.963E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.584863E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 04:59:15,362] [INFO] [logging.py:60:log_dist] [Rank 0] step=8600, skipped=2, lr=[0.0005962879206324137, 0.0005962879206324137], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8600 loss: 2.5792 iter time (s): 62.313 samples/sec: 16.433 %comms: 0.002903847808209574 %optimizer_step 0.05622349815608438 %forward: 23.30922182480207 %backward: 62.61797798559287 [2025-03-31 04:59:15,363] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18039.94 | forward: 145247.00 | backward_microstep: 390198.77 | backward: 390192.08 | backward_inner_microstep: 390174.98 | backward_inner: 390169.20 | backward_allreduce_microstep: 9.19 | backward_allreduce: 2.54 | reduce_tied_grads: 0.27 | comms: 18.09 | reduce_grads: 0.18 | step: 350.35 | _step_clipping: 0.13 | _step_step: 348.70 | _step_zero_grad: 0.46 | _step_check_overflow: 0.52 samples/sec: 16.433 | iteration 8600/ 143000 | elapsed time per iteration (ms): 62313.6 | learning rate: 5.963E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.584149E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 05:09:41,669] [INFO] [logging.py:60:log_dist] [Rank 0] step=8610, skipped=2, lr=[0.0005962775775307884, 0.0005962775775307884], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8610 loss: 2.5666 iter time (s): 62.630 samples/sec: 16.350 %comms: 0.0028511193356859058 %optimizer_step 0.05559945371798265 %forward: 23.18430963187658 %backward: 62.2812312181524 [2025-03-31 05:09:41,669] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21393.75 | forward: 145203.64 | backward_microstep: 390074.39 | backward: 390068.17 | backward_inner_microstep: 390049.99 | backward_inner: 390044.43 | backward_allreduce_microstep: 10.54 | backward_allreduce: 4.24 | reduce_tied_grads: 0.26 | comms: 17.86 | reduce_grads: 0.17 | step: 348.22 | _step_clipping: 0.12 | _step_step: 346.57 | _step_zero_grad: 0.46 | _step_check_overflow: 0.53 samples/sec: 16.350 | iteration 8610/ 143000 | elapsed time per iteration (ms): 62630.7 | learning rate: 5.963E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.579646E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 05:20:09,461] [INFO] [logging.py:60:log_dist] [Rank 0] step=8620, skipped=2, lr=[0.0005962672201294786, 0.0005962672201294786], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8620 loss: 2.5626 iter time (s): 62.779 samples/sec: 16.311 %comms: 0.002853831000413554 %optimizer_step 0.05670610999110381 %forward: 23.129683880986274 %backward: 62.12702739290275 [2025-03-31 05:20:09,461] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22939.68 | forward: 145205.02 | backward_microstep: 390032.57 | backward: 390025.06 | backward_inner_microstep: 390005.11 | backward_inner: 389999.49 | backward_allreduce_microstep: 10.54 | backward_allreduce: 4.12 | reduce_tied_grads: 0.24 | comms: 17.92 | reduce_grads: 0.17 | step: 355.99 | _step_clipping: 0.12 | _step_step: 354.41 | _step_zero_grad: 0.46 | _step_check_overflow: 0.48 samples/sec: 16.311 | iteration 8620/ 143000 | elapsed time per iteration (ms): 62779.2 | learning rate: 5.963E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.565205E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 05:30:31,953] [INFO] [logging.py:60:log_dist] [Rank 0] step=8630, skipped=2, lr=[0.0005962568484289844, 0.0005962568484289844], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8630 loss: 2.5529 iter time (s): 62.249 samples/sec: 16.450 %comms: 0.002862192015046 %optimizer_step 0.05588875253723618 %forward: 23.317547794177145 %backward: 62.65178828101733 [2025-03-31 05:30:31,954] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17723.33 | forward: 145148.78 | backward_microstep: 390005.22 | backward: 389999.43 | backward_inner_microstep: 389983.19 | backward_inner: 389977.80 | backward_allreduce_microstep: 8.74 | backward_allreduce: 2.42 | reduce_tied_grads: 0.23 | comms: 17.82 | reduce_grads: 0.17 | step: 347.90 | _step_clipping: 0.12 | _step_step: 346.24 | _step_zero_grad: 0.45 | _step_check_overflow: 0.57 samples/sec: 16.450 | iteration 8630/ 143000 | elapsed time per iteration (ms): 62249.3 | learning rate: 5.963E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.568480E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 05:40:57,536] [INFO] [logging.py:60:log_dist] [Rank 0] step=8640, skipped=2, lr=[0.0005962464624298062, 0.0005962464624298062], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8640 loss: 2.5686 iter time (s): 62.558 samples/sec: 16.369 %comms: 0.0028673389072958523 %optimizer_step 0.05617190665664731 %forward: 23.215454834981337 %backward: 62.353178784964214 [2025-03-31 05:40:57,537] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20618.22 | forward: 145230.62 | backward_microstep: 390074.89 | backward: 390067.34 | backward_inner_microstep: 390048.93 | backward_inner: 390043.21 | backward_allreduce_microstep: 8.85 | backward_allreduce: 2.50 | reduce_tied_grads: 0.31 | comms: 17.94 | reduce_grads: 0.19 | step: 351.40 | _step_clipping: 0.14 | _step_step: 349.69 | _step_zero_grad: 0.46 | _step_check_overflow: 0.54 samples/sec: 16.369 | iteration 8640/ 143000 | elapsed time per iteration (ms): 62558.3 | learning rate: 5.962E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.575966E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 05:51:19,381] [INFO] [logging.py:60:log_dist] [Rank 0] step=8650, skipped=2, lr=[0.0005962360621324452, 0.0005962360621324452], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8650 loss: 2.5778 iter time (s): 62.184 samples/sec: 16.467 %comms: 0.002884804200418464 %optimizer_step 0.05645384033678652 %forward: 23.352705171260457 %backward: 62.73143545479496 [2025-03-31 05:51:19,382] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16891.42 | forward: 145216.35 | backward_microstep: 390095.69 | backward: 390088.85 | backward_inner_microstep: 390073.61 | backward_inner: 390067.84 | backward_allreduce_microstep: 7.29 | backward_allreduce: 2.50 | reduce_tied_grads: 0.30 | comms: 17.94 | reduce_grads: 0.18 | step: 351.05 | _step_clipping: 0.14 | _step_step: 349.34 | _step_zero_grad: 0.46 | _step_check_overflow: 0.56 samples/sec: 16.467 | iteration 8650/ 143000 | elapsed time per iteration (ms): 62184.5 | learning rate: 5.962E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.566212E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 06:01:45,375] [INFO] [logging.py:60:log_dist] [Rank 0] step=8660, skipped=2, lr=[0.0005962256475374037, 0.0005962256475374037], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8660 loss: 2.5663 iter time (s): 62.599 samples/sec: 16.358 %comms: 0.0028576081440414517 %optimizer_step 0.05481173424055198 %forward: 23.191769807312856 %backward: 62.30253703584355 [2025-03-31 06:01:45,376] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21197.79 | forward: 145177.89 | backward_microstep: 390013.00 | backward: 390006.92 | backward_inner_microstep: 389990.52 | backward_inner: 389985.02 | backward_allreduce_microstep: 7.14 | backward_allreduce: 2.46 | reduce_tied_grads: 0.25 | comms: 17.89 | reduce_grads: 0.18 | step: 343.12 | _step_clipping: 0.11 | _step_step: 341.43 | _step_zero_grad: 0.46 | _step_check_overflow: 0.58 samples/sec: 16.358 | iteration 8660/ 143000 | elapsed time per iteration (ms): 62599.4 | learning rate: 5.962E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.578447E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 06:12:13,050] [INFO] [logging.py:60:log_dist] [Rank 0] step=8670, skipped=2, lr=[0.000596215218645184, 0.000596215218645184], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8670 loss: 2.5671 iter time (s): 62.767 samples/sec: 16.314 %comms: 0.002851441419895243 %optimizer_step 0.05638555615878931 %forward: 23.12684719127558 %backward: 62.142210974007284 [2025-03-31 06:12:13,050] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22848.22 | forward: 145159.99 | backward_microstep: 390053.41 | backward: 390047.22 | backward_inner_microstep: 390032.32 | backward_inner: 390026.76 | backward_allreduce_microstep: 7.29 | backward_allreduce: 2.43 | reduce_tied_grads: 0.28 | comms: 17.90 | reduce_grads: 0.18 | step: 353.91 | _step_clipping: 0.12 | _step_step: 352.34 | _step_zero_grad: 0.45 | _step_check_overflow: 0.46 samples/sec: 16.314 | iteration 8670/ 143000 | elapsed time per iteration (ms): 62767.4 | learning rate: 5.962E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.565842E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 06:22:36,057] [INFO] [logging.py:60:log_dist] [Rank 0] step=8680, skipped=2, lr=[0.0005962047754562896, 0.0005962047754562896], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8680 loss: 2.5595 iter time (s): 62.300 samples/sec: 16.437 %comms: 0.0028702001839411035 %optimizer_step 0.056275517505167995 %forward: 23.30296850078985 %backward: 62.61195202423194 [2025-03-31 06:22:36,057] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18103.41 | forward: 145177.87 | backward_microstep: 390080.95 | backward: 390073.47 | backward_inner_microstep: 390058.20 | backward_inner: 390052.45 | backward_allreduce_microstep: 7.39 | backward_allreduce: 2.47 | reduce_tied_grads: 0.28 | comms: 17.88 | reduce_grads: 0.19 | step: 350.60 | _step_clipping: 0.12 | _step_step: 348.90 | _step_zero_grad: 0.46 | _step_check_overflow: 0.56 samples/sec: 16.436 | iteration 8680/ 143000 | elapsed time per iteration (ms): 62300.7 | learning rate: 5.962E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.563926E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 06:33:01,436] [INFO] [logging.py:60:log_dist] [Rank 0] step=8690, skipped=2, lr=[0.0005961943179712245, 0.0005961943179712245], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8690 loss: 2.5782 iter time (s): 62.537 samples/sec: 16.374 %comms: 0.0028578637731504946 %optimizer_step 0.05585823261188092 %forward: 23.21551930543155 %backward: 62.36832379417061 [2025-03-31 06:33:01,436] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20510.16 | forward: 145183.79 | backward_microstep: 390041.43 | backward: 390035.21 | backward_inner_microstep: 390020.26 | backward_inner: 390014.60 | backward_allreduce_microstep: 7.22 | backward_allreduce: 2.49 | reduce_tied_grads: 0.28 | comms: 17.87 | reduce_grads: 0.18 | step: 349.32 | _step_clipping: 0.13 | _step_step: 347.62 | _step_zero_grad: 0.46 | _step_check_overflow: 0.58 samples/sec: 16.374 | iteration 8690/ 143000 | elapsed time per iteration (ms): 62537.9 | learning rate: 5.962E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.577363E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 06:43:22,907] [INFO] [logging.py:60:log_dist] [Rank 0] step=8700, skipped=2, lr=[0.0005961838461904933, 0.0005961838461904933], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8700 loss: 2.5668 iter time (s): 62.147 samples/sec: 16.477 %comms: 0.0028712283913563713 %optimizer_step 0.05655075407188405 %forward: 23.362835465289404 %backward: 62.76466377891368 [2025-03-31 06:43:22,908] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16549.17 | forward: 145192.20 | backward_microstep: 390067.49 | backward: 390061.37 | backward_inner_microstep: 390046.68 | backward_inner: 390041.24 | backward_allreduce_microstep: 7.14 | backward_allreduce: 2.43 | reduce_tied_grads: 0.25 | comms: 17.84 | reduce_grads: 0.18 | step: 351.44 | _step_clipping: 0.10 | _step_step: 349.84 | _step_zero_grad: 0.45 | _step_check_overflow: 0.52 samples/sec: 16.477 | iteration 8700/ 143000 | elapsed time per iteration (ms): 62147.2 | learning rate: 5.962E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.566706E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 06:53:49,775] [INFO] [logging.py:60:log_dist] [Rank 0] step=8710, skipped=2, lr=[0.0005961733601146018, 0.0005961733601146018], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8710 loss: 2.5794 iter time (s): 62.686 samples/sec: 16.335 %comms: 0.0028866044284943616 %optimizer_step 0.0558302142359716 %forward: 23.173655635776004 %backward: 62.229081937619945 [2025-03-31 06:53:49,775] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21840.74 | forward: 145266.76 | backward_microstep: 390097.55 | backward: 390090.26 | backward_inner_microstep: 390075.15 | backward_inner: 390069.32 | backward_allreduce_microstep: 7.19 | backward_allreduce: 2.47 | reduce_tied_grads: 0.31 | comms: 18.10 | reduce_grads: 0.18 | step: 349.98 | _step_clipping: 0.12 | _step_step: 348.37 | _step_zero_grad: 0.46 | _step_check_overflow: 0.49 samples/sec: 16.335 | iteration 8710/ 143000 | elapsed time per iteration (ms): 62686.7 | learning rate: 5.962E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.573032E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 07:04:11,452] [INFO] [logging.py:60:log_dist] [Rank 0] step=8720, skipped=2, lr=[0.0005961628597440558, 0.0005961628597440558], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8720 loss: 2.5631 iter time (s): 62.167 samples/sec: 16.472 %comms: 0.0028713916673256965 %optimizer_step 0.05793084750564956 %forward: 23.357873680882232 %backward: 62.735716913430736 [2025-03-31 07:04:11,452] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16818.82 | forward: 145209.36 | backward_microstep: 390016.28 | backward: 390010.37 | backward_inner_microstep: 389995.97 | backward_inner: 389990.51 | backward_allreduce_microstep: 6.96 | backward_allreduce: 2.38 | reduce_tied_grads: 0.25 | comms: 17.85 | reduce_grads: 0.17 | step: 360.14 | _step_clipping: 0.11 | _step_step: 358.51 | _step_zero_grad: 0.45 | _step_check_overflow: 0.55 samples/sec: 16.472 | iteration 8720/ 143000 | elapsed time per iteration (ms): 62167.7 | learning rate: 5.962E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.576528E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 07:14:36,541] [INFO] [logging.py:60:log_dist] [Rank 0] step=8730, skipped=2, lr=[0.0005961523450793621, 0.0005961523450793621], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8730 loss: 2.5741 iter time (s): 62.508 samples/sec: 16.382 %comms: 0.002858500723449673 %optimizer_step 0.05660196831087091 %forward: 23.22924473534836 %backward: 62.401321346536434 [2025-03-31 07:14:36,542] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20160.30 | forward: 145202.38 | backward_microstep: 390067.57 | backward: 390060.92 | backward_inner_microstep: 390044.42 | backward_inner: 390038.77 | backward_allreduce_microstep: 8.80 | backward_allreduce: 2.44 | reduce_tied_grads: 0.27 | comms: 17.87 | reduce_grads: 0.19 | step: 353.81 | _step_clipping: 0.15 | _step_step: 352.11 | _step_zero_grad: 0.46 | _step_check_overflow: 0.56 samples/sec: 16.382 | iteration 8730/ 143000 | elapsed time per iteration (ms): 62509.0 | learning rate: 5.962E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.576107E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 07:25:02,330] [INFO] [logging.py:60:log_dist] [Rank 0] step=8740, skipped=2, lr=[0.0005961418161210283, 0.0005961418161210283], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8740 loss: 2.5468 iter time (s): 62.578 samples/sec: 16.363 %comms: 0.0028497065989374474 %optimizer_step 0.056000047395417044 %forward: 23.20227001950127 %backward: 62.32220772272127 [2025-03-31 07:25:02,331] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20939.73 | forward: 145195.98 | backward_microstep: 390008.26 | backward: 390002.11 | backward_inner_microstep: 389987.31 | backward_inner: 389981.88 | backward_allreduce_microstep: 7.17 | backward_allreduce: 2.47 | reduce_tied_grads: 0.43 | comms: 17.83 | reduce_grads: 0.18 | step: 350.44 | _step_clipping: 0.13 | _step_step: 348.78 | _step_zero_grad: 0.45 | _step_check_overflow: 0.55 samples/sec: 16.363 | iteration 8740/ 143000 | elapsed time per iteration (ms): 62578.9 | learning rate: 5.961E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.567389E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 07:35:27,975] [INFO] [logging.py:60:log_dist] [Rank 0] step=8750, skipped=2, lr=[0.0005961312728695624, 0.0005961312728695624], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8750 loss: 2.5503 iter time (s): 62.564 samples/sec: 16.367 %comms: 0.0028643474563124 %optimizer_step 0.05608151781953104 %forward: 23.234463142650892 %backward: 62.35697106156899 [2025-03-31 07:35:27,976] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20486.68 | forward: 145364.02 | backward_microstep: 390136.85 | backward: 390129.95 | backward_inner_microstep: 390114.60 | backward_inner: 390108.79 | backward_allreduce_microstep: 7.33 | backward_allreduce: 2.52 | reduce_tied_grads: 0.30 | comms: 17.92 | reduce_grads: 0.18 | step: 350.87 | _step_clipping: 0.11 | _step_step: 349.20 | _step_zero_grad: 0.47 | _step_check_overflow: 0.56 samples/sec: 16.367 | iteration 8750/ 143000 | elapsed time per iteration (ms): 62564.5 | learning rate: 5.961E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.560465E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 07:45:50,500] [INFO] [logging.py:60:log_dist] [Rank 0] step=8760, skipped=2, lr=[0.0005961207153254734, 0.0005961207153254734], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8760 loss: 2.5602 iter time (s): 62.252 samples/sec: 16.449 %comms: 0.002876408155112402 %optimizer_step 0.05592998409840424 %forward: 23.32864503902662 %backward: 62.6647903123177 [2025-03-31 07:45:50,500] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17542.74 | forward: 145225.26 | backward_microstep: 390106.74 | backward: 390100.26 | backward_inner_microstep: 390085.53 | backward_inner: 390080.05 | backward_allreduce_microstep: 7.11 | backward_allreduce: 2.44 | reduce_tied_grads: 0.26 | comms: 17.91 | reduce_grads: 0.18 | step: 348.17 | _step_clipping: 0.13 | _step_step: 346.49 | _step_zero_grad: 0.46 | _step_check_overflow: 0.54 samples/sec: 16.449 | iteration 8760/ 143000 | elapsed time per iteration (ms): 62252.4 | learning rate: 5.961E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.569890E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 07:56:13,982] [INFO] [logging.py:60:log_dist] [Rank 0] step=8770, skipped=2, lr=[0.0005961101434892709, 0.0005961101434892709], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8770 loss: 2.5661 iter time (s): 62.348 samples/sec: 16.424 %comms: 0.0028643803216208442 %optimizer_step 0.05613195009839774 %forward: 23.30926417168139 %backward: 62.57274567144524 [2025-03-31 07:56:13,982] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18369.49 | forward: 145327.83 | backward_microstep: 390133.27 | backward: 390126.48 | backward_inner_microstep: 390109.57 | backward_inner: 390103.86 | backward_allreduce_microstep: 7.33 | backward_allreduce: 2.51 | reduce_tied_grads: 0.27 | comms: 17.86 | reduce_grads: 0.19 | step: 349.97 | _step_clipping: 0.12 | _step_step: 348.29 | _step_zero_grad: 0.46 | _step_check_overflow: 0.57 samples/sec: 16.424 | iteration 8770/ 143000 | elapsed time per iteration (ms): 62348.2 | learning rate: 5.961E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.573906E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 08:06:39,953] [INFO] [logging.py:60:log_dist] [Rank 0] step=8780, skipped=2, lr=[0.000596099557361465, 0.000596099557361465], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8780 loss: 2.5858 iter time (s): 62.597 samples/sec: 16.359 %comms: 0.0028580174635162237 %optimizer_step 0.05650203438780766 %forward: 23.201074931764058 %backward: 62.32618793608993 [2025-03-31 08:06:39,953] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20947.11 | forward: 145230.82 | backward_microstep: 390147.25 | backward: 390140.69 | backward_inner_microstep: 390125.56 | backward_inner: 390119.94 | backward_allreduce_microstep: 7.41 | backward_allreduce: 2.49 | reduce_tied_grads: 0.27 | comms: 17.89 | reduce_grads: 0.19 | step: 353.68 | _step_clipping: 0.11 | _step_step: 352.10 | _step_zero_grad: 0.46 | _step_check_overflow: 0.47 samples/sec: 16.359 | iteration 8780/ 143000 | elapsed time per iteration (ms): 62597.1 | learning rate: 5.961E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.588091E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 08:17:01,220] [INFO] [logging.py:60:log_dist] [Rank 0] step=8790, skipped=2, lr=[0.0005960889569425668, 0.0005960889569425668], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8790 loss: 2.5571 iter time (s): 62.126 samples/sec: 16.483 %comms: 0.002879732856578609 %optimizer_step 0.05613803936875155 %forward: 23.36657534549045 %backward: 62.78723822731263 [2025-03-31 08:17:01,221] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16362.26 | forward: 145167.70 | backward_microstep: 390079.51 | backward: 390073.38 | backward_inner_microstep: 390058.73 | backward_inner: 390046.43 | backward_allreduce_microstep: 7.07 | backward_allreduce: 2.42 | reduce_tied_grads: 0.26 | comms: 17.89 | reduce_grads: 0.18 | step: 348.76 | _step_clipping: 0.10 | _step_step: 347.12 | _step_zero_grad: 0.45 | _step_check_overflow: 0.56 samples/sec: 16.482 | iteration 8790/ 143000 | elapsed time per iteration (ms): 62126.8 | learning rate: 5.961E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.576643E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 08:27:27,809] [INFO] [logging.py:60:log_dist] [Rank 0] step=8800, skipped=2, lr=[0.0005960783422330878, 0.0005960783422330878], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8800 loss: 2.5697 iter time (s): 62.658 samples/sec: 16.343 %comms: 0.0028600351069727934 %optimizer_step 0.055969689210649744 %forward: 23.204594392906273 %backward: 62.25971327380192 [2025-03-31 08:27:27,809] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21391.07 | forward: 145396.05 | backward_microstep: 390116.36 | backward: 390108.79 | backward_inner_microstep: 390093.60 | backward_inner: 390087.77 | backward_allreduce_microstep: 7.26 | backward_allreduce: 2.50 | reduce_tied_grads: 0.28 | comms: 17.92 | reduce_grads: 0.19 | step: 350.70 | _step_clipping: 0.13 | _step_step: 349.05 | _step_zero_grad: 0.46 | _step_check_overflow: 0.52 samples/sec: 16.342 | iteration 8800/ 143000 | elapsed time per iteration (ms): 62658.8 | learning rate: 5.961E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.565941E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 08:37:50,969] [INFO] [logging.py:60:log_dist] [Rank 0] step=8810, skipped=2, lr=[0.0005960677132335404, 0.0005960677132335404], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8810 loss: 2.5853 iter time (s): 62.316 samples/sec: 16.432 %comms: 0.0029060302199986005 %optimizer_step 0.06129376519723387 %forward: 23.309239317089624 %backward: 62.612110685032064 [2025-03-31 08:37:50,970] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17994.01 | forward: 145252.77 | backward_microstep: 390177.75 | backward: 390170.73 | backward_inner_microstep: 390153.63 | backward_inner: 390147.91 | backward_allreduce_microstep: 7.23 | backward_allreduce: 2.49 | reduce_tied_grads: 0.31 | comms: 18.11 | reduce_grads: 0.19 | step: 381.96 | _step_clipping: 0.13 | _step_step: 380.14 | _step_zero_grad: 0.48 | _step_check_overflow: 0.61 samples/sec: 16.432 | iteration 8810/ 143000 | elapsed time per iteration (ms): 62316.1 | learning rate: 5.961E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.570299E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 08:48:23,183] [INFO] [logging.py:60:log_dist] [Rank 0] step=8820, skipped=2, lr=[0.0005960570699444375, 0.0005960570699444375], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8820 loss: 2.5723 iter time (s): 63.221 samples/sec: 16.197 %comms: 0.0028585364082553046 %optimizer_step 0.05584480986040751 %forward: 22.995790761675693 %backward: 61.71732707272515 [2025-03-31 08:48:23,183] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26923.90 | forward: 145381.18 | backward_microstep: 390189.81 | backward: 390181.75 | backward_inner_microstep: 390166.38 | backward_inner: 390160.44 | backward_allreduce_microstep: 7.32 | backward_allreduce: 2.53 | reduce_tied_grads: 0.29 | comms: 18.07 | reduce_grads: 0.19 | step: 353.06 | _step_clipping: 0.13 | _step_step: 351.33 | _step_zero_grad: 0.49 | _step_check_overflow: 0.53 samples/sec: 16.197 | iteration 8820/ 143000 | elapsed time per iteration (ms): 63221.3 | learning rate: 5.961E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.567258E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 08:58:44,838] [INFO] [logging.py:60:log_dist] [Rank 0] step=8830, skipped=2, lr=[0.0005960464123662928, 0.0005960464123662928], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8830 loss: 2.5691 iter time (s): 62.165 samples/sec: 16.472 %comms: 0.0028960785241669445 %optimizer_step 0.05675220092350933 %forward: 23.36184279203892 %backward: 62.76713591286121 [2025-03-31 08:58:44,839] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16518.91 | forward: 145228.83 | backward_microstep: 390200.42 | backward: 390191.72 | backward_inner_microstep: 390176.58 | backward_inner: 390170.80 | backward_allreduce_microstep: 7.28 | backward_allreduce: 2.51 | reduce_tied_grads: 0.28 | comms: 18.00 | reduce_grads: 0.18 | step: 352.80 | _step_clipping: 0.13 | _step_step: 351.04 | _step_zero_grad: 0.48 | _step_check_overflow: 0.58 samples/sec: 16.472 | iteration 8830/ 143000 | elapsed time per iteration (ms): 62165.5 | learning rate: 5.960E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.564978E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 09:09:10,893] [INFO] [logging.py:60:log_dist] [Rank 0] step=8840, skipped=2, lr=[0.0005960357404996208, 0.0005960357404996208], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8840 loss: 2.5776 iter time (s): 62.605 samples/sec: 16.357 %comms: 0.0029228726868828748 %optimizer_step 0.05696482739121206 %forward: 23.208042176004064 %backward: 62.32744540706666 [2025-03-31 09:09:10,893] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20811.30 | forward: 145293.80 | backward_microstep: 390208.16 | backward: 390200.58 | backward_inner_microstep: 390185.31 | backward_inner: 390179.38 | backward_allreduce_microstep: 7.26 | backward_allreduce: 2.49 | reduce_tied_grads: 0.30 | comms: 18.30 | reduce_grads: 0.20 | step: 356.63 | _step_clipping: 0.13 | _step_step: 354.93 | _step_zero_grad: 0.48 | _step_check_overflow: 0.53 samples/sec: 16.356 | iteration 8840/ 143000 | elapsed time per iteration (ms): 62605.5 | learning rate: 5.960E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.559431E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 09:19:29,583] [INFO] [logging.py:60:log_dist] [Rank 0] step=8850, skipped=2, lr=[0.0005960250543449365, 0.0005960250543449365], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8850 loss: 2.5761 iter time (s): 61.868 samples/sec: 16.551 %comms: 0.0029094950279658733 %optimizer_step 0.05820905312923666 %forward: 23.471791628733083 %backward: 63.05557942454752 [2025-03-31 09:19:29,583] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13626.01 | forward: 145216.40 | backward_microstep: 390122.55 | backward: 390115.27 | backward_inner_microstep: 390100.16 | backward_inner: 390094.20 | backward_allreduce_microstep: 7.23 | backward_allreduce: 2.48 | reduce_tied_grads: 0.29 | comms: 18.00 | reduce_grads: 0.19 | step: 360.13 | _step_clipping: 0.12 | _step_step: 358.43 | _step_zero_grad: 0.47 | _step_check_overflow: 0.54 samples/sec: 16.551 | iteration 8850/ 143000 | elapsed time per iteration (ms): 61869.0 | learning rate: 5.960E-04 | approx flops per GPU: 71.4TFLOPS | lm_loss: 2.559613E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 09:30:00,345] [INFO] [logging.py:60:log_dist] [Rank 0] step=8860, skipped=2, lr=[0.0005960143539027556, 0.0005960143539027556], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8860 loss: 2.5754 iter time (s): 63.076 samples/sec: 16.234 %comms: 0.0028652283646898933 %optimizer_step 0.058337836634674935 %forward: 23.050293223608556 %backward: 61.84276977438658 [2025-03-31 09:30:00,345] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25557.74 | forward: 145391.15 | backward_microstep: 390084.90 | backward: 390077.10 | backward_inner_microstep: 390061.62 | backward_inner: 390055.65 | backward_allreduce_microstep: 7.39 | backward_allreduce: 2.54 | reduce_tied_grads: 0.35 | comms: 18.07 | reduce_grads: 0.22 | step: 367.97 | _step_clipping: 0.14 | _step_step: 366.16 | _step_zero_grad: 0.55 | _step_check_overflow: 0.52 samples/sec: 16.234 | iteration 8860/ 143000 | elapsed time per iteration (ms): 63076.2 | learning rate: 5.960E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.580865E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 09:40:22,866] [INFO] [logging.py:60:log_dist] [Rank 0] step=8870, skipped=2, lr=[0.0005960036391735946, 0.0005960036391735946], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8870 loss: 2.5689 iter time (s): 62.252 samples/sec: 16.449 %comms: 0.0029237204449631743 %optimizer_step 0.05704703708062121 %forward: 23.328615447352018 %backward: 62.658704211740655 [2025-03-31 09:40:22,867] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17519.23 | forward: 145224.43 | backward_microstep: 390067.56 | backward: 390060.64 | backward_inner_microstep: 390045.42 | backward_inner: 390039.64 | backward_allreduce_microstep: 7.34 | backward_allreduce: 2.52 | reduce_tied_grads: 0.30 | comms: 18.20 | reduce_grads: 0.20 | step: 355.13 | _step_clipping: 0.14 | _step_step: 353.36 | _step_zero_grad: 0.46 | _step_check_overflow: 0.60 samples/sec: 16.449 | iteration 8870/ 143000 | elapsed time per iteration (ms): 62252.2 | learning rate: 5.960E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.579922E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 09:50:48,979] [INFO] [logging.py:60:log_dist] [Rank 0] step=8880, skipped=2, lr=[0.0005959929101579706, 0.0005959929101579706], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8880 loss: 2.5528 iter time (s): 62.611 samples/sec: 16.355 %comms: 0.0028518129414420584 %optimizer_step 0.055817980912461956 %forward: 23.212029828070815 %backward: 62.31009059595607 [2025-03-31 09:50:48,979] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20954.70 | forward: 145332.19 | backward_microstep: 390135.15 | backward: 390127.96 | backward_inner_microstep: 390110.96 | backward_inner: 390105.19 | backward_allreduce_microstep: 7.25 | backward_allreduce: 2.50 | reduce_tied_grads: 0.27 | comms: 17.86 | reduce_grads: 0.18 | step: 349.48 | _step_clipping: 0.14 | _step_step: 347.89 | _step_zero_grad: 0.47 | _step_check_overflow: 0.46 samples/sec: 16.355 | iteration 8880/ 143000 | elapsed time per iteration (ms): 62611.2 | learning rate: 5.960E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.565804E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 10:01:11,398] [INFO] [logging.py:60:log_dist] [Rank 0] step=8890, skipped=2, lr=[0.0005959821668564016, 0.0005959821668564016], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8890 loss: 2.5648 iter time (s): 62.241 samples/sec: 16.452 %comms: 0.0028850900830958075 %optimizer_step 0.056189004420198335 %forward: 23.332499685382952 %backward: 62.680859505944056 [2025-03-31 10:01:11,399] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17340.71 | forward: 145224.79 | backward_microstep: 390142.04 | backward: 390134.57 | backward_inner_microstep: 390117.42 | backward_inner: 390109.81 | backward_allreduce_microstep: 7.29 | backward_allreduce: 2.52 | reduce_tied_grads: 0.28 | comms: 17.96 | reduce_grads: 0.18 | step: 349.73 | _step_clipping: 0.12 | _step_step: 348.03 | _step_zero_grad: 0.46 | _step_check_overflow: 0.57 samples/sec: 16.452 | iteration 8890/ 143000 | elapsed time per iteration (ms): 62241.9 | learning rate: 5.960E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.556842E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 10:11:40,926] [INFO] [logging.py:60:log_dist] [Rank 0] step=8900, skipped=2, lr=[0.0005959714092694061, 0.0005959714092694061], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8900 loss: 2.5473 iter time (s): 62.952 samples/sec: 16.266 %comms: 0.0028367568286080166 %optimizer_step 0.05492072112070438 %forward: 23.069429227986983 %backward: 61.959102631096194 [2025-03-31 10:11:40,927] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24555.71 | forward: 145227.30 | backward_microstep: 390053.51 | backward: 390046.62 | backward_inner_microstep: 390031.40 | backward_inner: 390025.69 | backward_allreduce_microstep: 7.19 | backward_allreduce: 2.49 | reduce_tied_grads: 0.27 | comms: 17.86 | reduce_grads: 0.18 | step: 345.74 | _step_clipping: 0.11 | _step_step: 344.16 | _step_zero_grad: 0.46 | _step_check_overflow: 0.48 samples/sec: 16.266 | iteration 8900/ 143000 | elapsed time per iteration (ms): 62952.8 | learning rate: 5.960E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.558498E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 10:21:59,067] [INFO] [logging.py:60:log_dist] [Rank 0] step=8910, skipped=2, lr=[0.0005959606373975029, 0.0005959606373975029], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8910 loss: 2.5675 iter time (s): 61.814 samples/sec: 16.566 %comms: 0.002902207793019829 %optimizer_step 0.05689972697003068 %forward: 23.486425399973875 %backward: 63.12180460076667 [2025-03-31 10:21:59,068] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13058.64 | forward: 145177.86 | backward_microstep: 390185.36 | backward: 390178.08 | backward_inner_microstep: 390162.78 | backward_inner: 390156.90 | backward_allreduce_microstep: 7.38 | backward_allreduce: 2.47 | reduce_tied_grads: 0.33 | comms: 17.94 | reduce_grads: 0.19 | step: 351.72 | _step_clipping: 0.11 | _step_step: 350.06 | _step_zero_grad: 0.45 | _step_check_overflow: 0.54 samples/sec: 16.566 | iteration 8910/ 143000 | elapsed time per iteration (ms): 61814.1 | learning rate: 5.960E-04 | approx flops per GPU: 71.5TFLOPS | lm_loss: 2.558110E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 10:32:28,228] [INFO] [logging.py:60:log_dist] [Rank 0] step=8920, skipped=2, lr=[0.0005959498512412125, 0.0005959498512412125], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8920 loss: 2.5415 iter time (s): 62.915 samples/sec: 16.276 %comms: 0.002876196158621319 %optimizer_step 0.05679747512487937 %forward: 23.088795382296645 %backward: 62.00373899588993 [2025-03-31 10:32:28,228] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24055.29 | forward: 145264.31 | backward_microstep: 390106.57 | backward: 390099.62 | backward_inner_microstep: 390084.53 | backward_inner: 390078.71 | backward_allreduce_microstep: 7.25 | backward_allreduce: 2.48 | reduce_tied_grads: 0.30 | comms: 18.10 | reduce_grads: 0.19 | step: 357.34 | _step_clipping: 0.13 | _step_step: 355.50 | _step_zero_grad: 0.50 | _step_check_overflow: 0.62 samples/sec: 16.276 | iteration 8920/ 143000 | elapsed time per iteration (ms): 62916.1 | learning rate: 5.959E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.553870E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 10:42:59,435] [INFO] [logging.py:60:log_dist] [Rank 0] step=8930, skipped=2, lr=[0.0005959390508010551, 0.0005959390508010551], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8930 loss: 2.5786 iter time (s): 63.120 samples/sec: 16.223 %comms: 0.0028354793462047755 %optimizer_step 0.05519375299162703 %forward: 23.028839872029803 %backward: 61.79563425088759 [2025-03-31 10:42:59,436] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26057.22 | forward: 145358.53 | backward_microstep: 390064.34 | backward: 390055.36 | backward_inner_microstep: 390038.68 | backward_inner: 390033.09 | backward_allreduce_microstep: 8.80 | backward_allreduce: 2.43 | reduce_tied_grads: 0.28 | comms: 17.90 | reduce_grads: 0.18 | step: 348.38 | _step_clipping: 0.12 | _step_step: 346.68 | _step_zero_grad: 0.46 | _step_check_overflow: 0.59 samples/sec: 16.223 | iteration 8930/ 143000 | elapsed time per iteration (ms): 63120.8 | learning rate: 5.959E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.564317E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 10:53:27,170] [INFO] [logging.py:60:log_dist] [Rank 0] step=8940, skipped=2, lr=[0.0005959282360775521, 0.0005959282360775521], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8940 loss: 2.5568 iter time (s): 62.773 samples/sec: 16.313 %comms: 0.002851129734035587 %optimizer_step 0.05579827284852666 %forward: 23.135204234888466 %backward: 62.140802879546 [2025-03-31 10:53:27,170] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22691.05 | forward: 145226.38 | backward_microstep: 390082.86 | backward: 390075.83 | backward_inner_microstep: 390060.90 | backward_inner: 390055.23 | backward_allreduce_microstep: 7.16 | backward_allreduce: 2.46 | reduce_tied_grads: 0.28 | comms: 17.90 | reduce_grads: 0.18 | step: 350.26 | _step_clipping: 0.13 | _step_step: 348.57 | _step_zero_grad: 0.46 | _step_check_overflow: 0.57 samples/sec: 16.313 | iteration 8940/ 143000 | elapsed time per iteration (ms): 62773.4 | learning rate: 5.959E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.558255E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 11:03:53,098] [INFO] [logging.py:60:log_dist] [Rank 0] step=8950, skipped=2, lr=[0.0005959174070712254, 0.0005959174070712254], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8950 loss: 2.5671 iter time (s): 62.592 samples/sec: 16.360 %comms: 0.002853146925641741 %optimizer_step 0.055792194165585896 %forward: 23.192509882306553 %backward: 62.30673661516592 [2025-03-31 11:03:53,099] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21046.58 | forward: 145167.27 | backward_microstep: 389998.72 | backward: 389992.25 | backward_inner_microstep: 389977.54 | backward_inner: 389972.01 | backward_allreduce_microstep: 7.08 | backward_allreduce: 2.44 | reduce_tied_grads: 0.27 | comms: 17.86 | reduce_grads: 0.19 | step: 349.22 | _step_clipping: 0.14 | _step_step: 347.57 | _step_zero_grad: 0.46 | _step_check_overflow: 0.51 samples/sec: 16.360 | iteration 8950/ 143000 | elapsed time per iteration (ms): 62592.8 | learning rate: 5.959E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.563922E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 11:14:15,080] [INFO] [logging.py:60:log_dist] [Rank 0] step=8960, skipped=2, lr=[0.0005959065637825976, 0.0005959065637825976], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8960 loss: 2.5666 iter time (s): 62.198 samples/sec: 16.464 %comms: 0.0028751953133397184 %optimizer_step 0.05686245255749488 %forward: 23.339260438694293 %backward: 62.71107338280113 [2025-03-31 11:14:15,081] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17021.99 | forward: 145164.90 | backward_microstep: 390055.38 | backward: 390048.65 | backward_inner_microstep: 390033.60 | backward_inner: 390027.84 | backward_allreduce_microstep: 7.23 | backward_allreduce: 2.49 | reduce_tied_grads: 0.25 | comms: 17.88 | reduce_grads: 0.18 | step: 353.67 | _step_clipping: 0.11 | _step_step: 352.09 | _step_zero_grad: 0.46 | _step_check_overflow: 0.48 samples/sec: 16.463 | iteration 8960/ 143000 | elapsed time per iteration (ms): 62198.2 | learning rate: 5.959E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.575670E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 11:24:36,882] [INFO] [logging.py:60:log_dist] [Rank 0] step=8970, skipped=2, lr=[0.0005958957062121923, 0.0005958957062121923], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8970 loss: 2.5782 iter time (s): 62.180 samples/sec: 16.468 %comms: 0.0028889924211611853 %optimizer_step 0.057207686436249565 %forward: 23.35163475107584 %backward: 62.74849629296087 [2025-03-31 11:24:36,883] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16637.81 | forward: 145199.60 | backward_microstep: 390176.75 | backward: 390167.83 | backward_inner_microstep: 390152.54 | backward_inner: 390146.53 | backward_allreduce_microstep: 7.33 | backward_allreduce: 2.51 | reduce_tied_grads: 0.32 | comms: 17.96 | reduce_grads: 0.19 | step: 355.72 | _step_clipping: 0.12 | _step_step: 354.03 | _step_zero_grad: 0.47 | _step_check_overflow: 0.55 samples/sec: 16.468 | iteration 8970/ 143000 | elapsed time per iteration (ms): 62180.2 | learning rate: 5.959E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.559274E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 11:34:59,244] [INFO] [logging.py:60:log_dist] [Rank 0] step=8980, skipped=2, lr=[0.0005958848343605333, 0.0005958848343605333], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8980 loss: 2.5681 iter time (s): 62.236 samples/sec: 16.454 %comms: 0.0028741327774587 %optimizer_step 0.0562269631242551 %forward: 23.347869227771543 %backward: 62.67658312158962 [2025-03-31 11:34:59,245] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17232.85 | forward: 145307.00 | backward_microstep: 390079.24 | backward: 390071.83 | backward_inner_microstep: 390056.44 | backward_inner: 390050.63 | backward_allreduce_microstep: 7.39 | backward_allreduce: 2.52 | reduce_tied_grads: 0.27 | comms: 17.89 | reduce_grads: 0.19 | step: 349.93 | _step_clipping: 0.11 | _step_step: 348.29 | _step_zero_grad: 0.45 | _step_check_overflow: 0.54 samples/sec: 16.453 | iteration 8980/ 143000 | elapsed time per iteration (ms): 62236.3 | learning rate: 5.959E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.567254E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 11:45:16,677] [INFO] [logging.py:60:log_dist] [Rank 0] step=8990, skipped=2, lr=[0.0005958739482281455, 0.0005958739482281455], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 8990 loss: 2.5739 iter time (s): 61.743 samples/sec: 16.585 %comms: 0.002892950145586409 %optimizer_step 0.05724077856202799 %forward: 23.504382530438132 %backward: 63.17577023896934 [2025-03-31 11:45:16,677] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 12493.12 | forward: 145122.30 | backward_microstep: 390070.01 | backward: 390064.00 | backward_inner_microstep: 390047.71 | backward_inner: 390042.23 | backward_allreduce_microstep: 8.71 | backward_allreduce: 2.44 | reduce_tied_grads: 0.25 | comms: 17.86 | reduce_grads: 0.18 | step: 353.42 | _step_clipping: 0.11 | _step_step: 351.78 | _step_zero_grad: 0.45 | _step_check_overflow: 0.54 samples/sec: 16.585 | iteration 8990/ 143000 | elapsed time per iteration (ms): 61743.2 | learning rate: 5.959E-04 | approx flops per GPU: 71.5TFLOPS | lm_loss: 2.557686E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 11:55:42,999] [INFO] [logging.py:60:log_dist] [Rank 0] step=9000, skipped=2, lr=[0.0005958630478155542, 0.0005958630478155542], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9000 loss: 2.5754 iter time (s): 62.632 samples/sec: 16.350 %comms: 0.002848766110458318 %optimizer_step 0.0547906312069039 %forward: 23.20008317532165 %backward: 62.27904365394432 [2025-03-31 11:55:42,999] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21192.43 | forward: 145305.95 | backward_microstep: 390071.08 | backward: 390063.94 | backward_inner_microstep: 390047.30 | backward_inner: 390041.62 | backward_allreduce_microstep: 8.88 | backward_allreduce: 2.46 | reduce_tied_grads: 0.26 | comms: 17.84 | reduce_grads: 0.18 | step: 343.16 | _step_clipping: 0.12 | _step_step: 341.54 | _step_zero_grad: 0.47 | _step_check_overflow: 0.51 samples/sec: 16.349 | iteration 9000/ 143000 | elapsed time per iteration (ms): 62632.2 | learning rate: 5.959E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.567156E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 11:55:45,797] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: ../checkpoints/mydata_left-pythia160m/global_step9000/mp_rank_00_model_states.pt [2025-03-31 11:55:59,657] [INFO] [engine.py:1805:_copy_recovery_script] creating recovery script ../checkpoints/mydata_left-pythia160m/zero_to_fp32.py [2025-03-31 11:55:59,665] [INFO] [engine.py:1818:_save_zero_checkpoint] zero checkpoint saved ../checkpoints/mydata_left-pythia160m/global_step9000/zero_pp_rank_0_mp_rank_00_optim_states.pt [2025-03-31 12:06:26,479] [INFO] [logging.py:60:log_dist] [Rank 0] step=9010, skipped=2, lr=[0.0005958521331232854, 0.0005958521331232854], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9010 loss: 2.5725 iter time (s): 62.680 samples/sec: 16.337 %comms: 0.0028557840532828275 %optimizer_step 0.056779048645086566 %forward: 23.180772194863778 %backward: 62.23763440655139 [2025-03-31 12:06:26,480] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21600.89 | forward: 145296.56 | backward_microstep: 390111.46 | backward: 390104.09 | backward_inner_microstep: 390088.59 | backward_inner: 390082.73 | backward_allreduce_microstep: 7.32 | backward_allreduce: 2.51 | reduce_tied_grads: 0.28 | comms: 17.90 | reduce_grads: 0.18 | step: 355.89 | _step_clipping: 0.12 | _step_step: 354.21 | _step_zero_grad: 0.46 | _step_check_overflow: 0.55 samples/sec: 15.913 | iteration 9010/ 143000 | elapsed time per iteration (ms): 64348.1 | learning rate: 5.959E-04 | approx flops per GPU: 68.6TFLOPS | lm_loss: 2.568322E+00 | loss scale: 1048576.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 12:16:52,495] [INFO] [logging.py:60:log_dist] [Rank 0] step=9020, skipped=2, lr=[0.0005958412041518662, 0.0005958412041518662], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9020 loss: 2.5584 iter time (s): 62.601 samples/sec: 16.358 %comms: 0.0028514151961632527 %optimizer_step 0.055352013490055994 %forward: 23.206876410107107 %backward: 62.31184046924852 [2025-03-31 12:16:52,496] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20881.29 | forward: 145277.50 | backward_microstep: 390085.07 | backward: 390078.71 | backward_inner_microstep: 390063.81 | backward_inner: 390058.20 | backward_allreduce_microstep: 7.16 | backward_allreduce: 2.47 | reduce_tied_grads: 0.26 | comms: 17.85 | reduce_grads: 0.18 | step: 346.51 | _step_clipping: 0.12 | _step_step: 344.82 | _step_zero_grad: 0.47 | _step_check_overflow: 0.57 samples/sec: 16.357 | iteration 9020/ 143000 | elapsed time per iteration (ms): 62601.6 | learning rate: 5.958E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.555411E+00 | loss scale: 1048576.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 12:24:09,142] [INFO] [stage1.py:697:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 1048576.0, reducing to 1048576.0 [2025-03-31 12:25:15,722] [INFO] [stage1.py:697:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 1048576.0, reducing to 524288.0 [2025-03-31 12:27:19,374] [INFO] [logging.py:60:log_dist] [Rank 0] step=9030, skipped=4, lr=[0.0005958324506940968, 0.0005958324506940968], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9030 loss: 2.5469 iter time (s): 62.687 samples/sec: 16.335 %comms: 0.002295516298307771 %optimizer_step 0.04651478352677914 %forward: 23.168655012627788 %backward: 62.23580679155957 [2025-03-31 12:27:19,375] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21758.01 | forward: 145238.24 | backward_microstep: 390146.96 | backward: 390140.01 | backward_inner_microstep: 390125.18 | backward_inner: 390119.27 | backward_allreduce_microstep: 7.14 | backward_allreduce: 2.45 | reduce_tied_grads: 0.26 | comms: 14.39 | reduce_grads: 0.18 | step: 291.59 | _step_clipping: 0.13 | _step_step: 290.00 | _step_zero_grad: 0.45 | _step_check_overflow: 0.47 samples/sec: 16.335 | iteration 9030/ 143000 | elapsed time per iteration (ms): 62687.9 | learning rate: 5.958E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.553328E+00 | loss scale: 524288.0 | number of skipped iterations: 2 | number of nan iterations: 0 | time (ms) [2025-03-31 12:37:40,948] [INFO] [logging.py:60:log_dist] [Rank 0] step=9040, skipped=4, lr=[0.0005958214960215361, 0.0005958214960215361], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9040 loss: 2.5494 iter time (s): 62.157 samples/sec: 16.474 %comms: 0.0028790043945044522 %optimizer_step 0.05650093591712199 %forward: 23.351056092166928 %backward: 62.76106007260904 [2025-03-31 12:37:40,949] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16524.15 | forward: 145142.80 | backward_microstep: 390109.56 | backward: 390102.95 | backward_inner_microstep: 390088.22 | backward_inner: 390082.67 | backward_allreduce_microstep: 7.10 | backward_allreduce: 2.43 | reduce_tied_grads: 0.25 | comms: 17.89 | reduce_grads: 0.19 | step: 351.19 | _step_clipping: 0.13 | _step_step: 349.48 | _step_zero_grad: 0.48 | _step_check_overflow: 0.56 samples/sec: 16.474 | iteration 9040/ 143000 | elapsed time per iteration (ms): 62157.4 | learning rate: 5.958E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.556691E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 12:48:02,330] [INFO] [logging.py:60:log_dist] [Rank 0] step=9050, skipped=4, lr=[0.0005958105270713036, 0.0005958105270713036], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9050 loss: 2.5598 iter time (s): 62.138 samples/sec: 16.480 %comms: 0.0028730646385043725 %optimizer_step 0.05877552252740328 %forward: 23.360587605860754 %backward: 62.78708105303452 [2025-03-31 12:48:02,330] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16268.98 | forward: 145157.17 | backward_microstep: 390155.97 | backward: 390144.09 | backward_inner_microstep: 390127.66 | backward_inner: 390120.28 | backward_allreduce_microstep: 7.04 | backward_allreduce: 2.42 | reduce_tied_grads: 0.26 | comms: 17.85 | reduce_grads: 0.18 | step: 365.22 | _step_clipping: 0.11 | _step_step: 363.57 | _step_zero_grad: 0.47 | _step_check_overflow: 0.54 samples/sec: 16.479 | iteration 9050/ 143000 | elapsed time per iteration (ms): 62138.2 | learning rate: 5.958E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.549138E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 12:58:24,727] [INFO] [logging.py:60:log_dist] [Rank 0] step=9060, skipped=4, lr=[0.0005957995438439286, 0.0005957995438439286], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9060 loss: 2.5425 iter time (s): 62.239 samples/sec: 16.453 %comms: 0.00286898704521359 %optimizer_step 0.0558764675400243 %forward: 23.333011267718597 %backward: 62.678242758501256 [2025-03-31 12:58:24,728] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17267.37 | forward: 145222.91 | backward_microstep: 390111.18 | backward: 390104.67 | backward_inner_microstep: 390089.73 | backward_inner: 390084.13 | backward_allreduce_microstep: 7.24 | backward_allreduce: 2.51 | reduce_tied_grads: 0.28 | comms: 17.86 | reduce_grads: 0.18 | step: 347.77 | _step_clipping: 0.12 | _step_step: 346.11 | _step_zero_grad: 0.45 | _step_check_overflow: 0.54 samples/sec: 16.453 | iteration 9060/ 143000 | elapsed time per iteration (ms): 62239.8 | learning rate: 5.958E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.548786E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 13:08:49,876] [INFO] [logging.py:60:log_dist] [Rank 0] step=9070, skipped=4, lr=[0.000595788546339941, 0.000595788546339941], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9070 loss: 2.5632 iter time (s): 62.514 samples/sec: 16.380 %comms: 0.00285651424454151 %optimizer_step 0.055169522926847125 %forward: 23.221636314004467 %backward: 62.39658504738449 [2025-03-31 13:08:49,877] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20173.36 | forward: 145168.55 | backward_microstep: 390076.61 | backward: 390068.20 | backward_inner_microstep: 390051.47 | backward_inner: 390043.96 | backward_allreduce_microstep: 7.22 | backward_allreduce: 2.48 | reduce_tied_grads: 0.26 | comms: 17.86 | reduce_grads: 0.19 | step: 344.89 | _step_clipping: 0.13 | _step_step: 343.31 | _step_zero_grad: 0.45 | _step_check_overflow: 0.46 samples/sec: 16.380 | iteration 9070/ 143000 | elapsed time per iteration (ms): 62514.9 | learning rate: 5.958E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.562212E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 13:19:12,814] [INFO] [logging.py:60:log_dist] [Rank 0] step=9080, skipped=4, lr=[0.0005957775345598721, 0.0005957775345598721], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9080 loss: 2.5398 iter time (s): 62.293 samples/sec: 16.438 %comms: 0.0028961616091644333 %optimizer_step 0.05846499467275242 %forward: 23.322256555581696 %backward: 62.63571314459031 [2025-03-31 13:19:12,814] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17672.21 | forward: 145281.93 | backward_microstep: 390185.64 | backward: 390178.26 | backward_inner_microstep: 390160.99 | backward_inner: 390155.09 | backward_allreduce_microstep: 9.28 | backward_allreduce: 2.57 | reduce_tied_grads: 0.27 | comms: 18.04 | reduce_grads: 0.18 | step: 364.20 | _step_clipping: 0.11 | _step_step: 362.41 | _step_zero_grad: 0.50 | _step_check_overflow: 0.60 samples/sec: 16.438 | iteration 9080/ 143000 | elapsed time per iteration (ms): 62293.8 | learning rate: 5.958E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.552485E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 13:29:37,695] [INFO] [logging.py:60:log_dist] [Rank 0] step=9090, skipped=4, lr=[0.0005957665085042529, 0.0005957665085042529], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9090 loss: 2.5400 iter time (s): 62.488 samples/sec: 16.387 %comms: 0.0028947124712622153 %optimizer_step 0.059227141889406726 %forward: 23.270379852615616 %backward: 62.43645063456008 [2025-03-31 13:29:37,696] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19500.17 | forward: 145410.84 | backward_microstep: 390158.34 | backward: 390149.90 | backward_inner_microstep: 390133.75 | backward_inner: 390127.57 | backward_allreduce_microstep: 7.79 | backward_allreduce: 2.68 | reduce_tied_grads: 0.31 | comms: 18.09 | reduce_grads: 0.20 | step: 370.10 | _step_clipping: 0.15 | _step_step: 368.13 | _step_zero_grad: 0.55 | _step_check_overflow: 0.66 samples/sec: 16.387 | iteration 9090/ 143000 | elapsed time per iteration (ms): 62488.1 | learning rate: 5.958E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.541486E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 13:40:00,892] [INFO] [logging.py:60:log_dist] [Rank 0] step=9100, skipped=4, lr=[0.0005957554681736158, 0.0005957554681736158], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9100 loss: 2.5463 iter time (s): 62.319 samples/sec: 16.432 %comms: 0.002880385895577134 %optimizer_step 0.0567776400412517 %forward: 23.31475619161643 %backward: 62.6003033443578 [2025-03-31 13:40:00,892] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18000.93 | forward: 145295.39 | backward_microstep: 390126.79 | backward: 390119.26 | backward_inner_microstep: 390103.65 | backward_inner: 390097.72 | backward_allreduce_microstep: 7.51 | backward_allreduce: 2.60 | reduce_tied_grads: 0.30 | comms: 17.95 | reduce_grads: 0.18 | step: 353.83 | _step_clipping: 0.11 | _step_step: 352.15 | _step_zero_grad: 0.45 | _step_check_overflow: 0.57 samples/sec: 16.431 | iteration 9100/ 143000 | elapsed time per iteration (ms): 62319.6 | learning rate: 5.958E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.552540E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 13:50:31,705] [INFO] [logging.py:60:log_dist] [Rank 0] step=9110, skipped=4, lr=[0.0005957444135684935, 0.0005957444135684935], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9110 loss: 2.5360 iter time (s): 63.081 samples/sec: 16.233 %comms: 0.0028627641036139293 %optimizer_step 0.05623407848494319 %forward: 23.09455964464259 %backward: 61.87947552752596 [2025-03-31 13:50:31,706] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24945.81 | forward: 145682.28 | backward_microstep: 390350.86 | backward: 390340.54 | backward_inner_microstep: 390323.70 | backward_inner: 390317.24 | backward_allreduce_microstep: 8.15 | backward_allreduce: 2.77 | reduce_tied_grads: 0.31 | comms: 18.06 | reduce_grads: 0.19 | step: 354.73 | _step_clipping: 0.13 | _step_step: 352.94 | _step_zero_grad: 0.51 | _step_check_overflow: 0.57 samples/sec: 16.233 | iteration 9110/ 143000 | elapsed time per iteration (ms): 63081.3 | learning rate: 5.957E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.552073E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 14:00:59,951] [INFO] [logging.py:60:log_dist] [Rank 0] step=9120, skipped=4, lr=[0.0005957333446894198, 0.0005957333446894198], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9120 loss: 2.5712 iter time (s): 62.824 samples/sec: 16.299 %comms: 0.0028869865741500173 %optimizer_step 0.057262317800053614 %forward: 23.172630998400983 %backward: 62.117987626716264 [2025-03-31 14:00:59,952] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22605.40 | forward: 145579.83 | backward_microstep: 390259.35 | backward: 390250.31 | backward_inner_microstep: 390234.11 | backward_inner: 390227.94 | backward_allreduce_microstep: 7.75 | backward_allreduce: 2.69 | reduce_tied_grads: 0.30 | comms: 18.14 | reduce_grads: 0.20 | step: 359.75 | _step_clipping: 0.12 | _step_step: 357.94 | _step_zero_grad: 0.51 | _step_check_overflow: 0.58 samples/sec: 16.299 | iteration 9120/ 143000 | elapsed time per iteration (ms): 62824.6 | learning rate: 5.957E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.552308E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 14:11:30,618] [INFO] [logging.py:60:log_dist] [Rank 0] step=9130, skipped=4, lr=[0.0005957222615369286, 0.0005957222615369286], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9130 loss: 2.5543 iter time (s): 63.066 samples/sec: 16.237 %comms: 0.002860484263310174 %optimizer_step 0.05661259957989249 %forward: 23.117918388299803 %backward: 61.918276260607165 [2025-03-31 14:11:30,618] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24532.49 | forward: 145795.56 | backward_microstep: 390504.77 | backward: 390494.06 | backward_inner_microstep: 390477.54 | backward_inner: 390471.13 | backward_allreduce_microstep: 7.78 | backward_allreduce: 2.67 | reduce_tied_grads: 0.34 | comms: 18.04 | reduce_grads: 0.20 | step: 357.03 | _step_clipping: 0.14 | _step_step: 355.11 | _step_zero_grad: 0.52 | _step_check_overflow: 0.65 samples/sec: 16.237 | iteration 9130/ 143000 | elapsed time per iteration (ms): 63066.6 | learning rate: 5.957E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.551029E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 14:21:55,326] [INFO] [logging.py:60:log_dist] [Rank 0] step=9140, skipped=4, lr=[0.0005957111641115553, 0.0005957111641115553], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9140 loss: 2.5573 iter time (s): 62.470 samples/sec: 16.392 %comms: 0.002905665928645942 %optimizer_step 0.05674067059974417 %forward: 23.26300274301596 %backward: 62.473106150519406 [2025-03-31 14:21:55,326] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19306.56 | forward: 145324.50 | backward_microstep: 390279.29 | backward: 390270.89 | backward_inner_microstep: 390254.73 | backward_inner: 390248.55 | backward_allreduce_microstep: 7.74 | backward_allreduce: 2.65 | reduce_tied_grads: 0.27 | comms: 18.15 | reduce_grads: 0.19 | step: 354.46 | _step_clipping: 0.13 | _step_step: 352.69 | _step_zero_grad: 0.51 | _step_check_overflow: 0.54 samples/sec: 16.392 | iteration 9140/ 143000 | elapsed time per iteration (ms): 62470.8 | learning rate: 5.957E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.553265E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 14:32:25,217] [INFO] [logging.py:60:log_dist] [Rank 0] step=9150, skipped=4, lr=[0.0005957000524138349, 0.0005957000524138349], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9150 loss: 2.5335 iter time (s): 62.989 samples/sec: 16.257 %comms: 0.0028806173645386166 %optimizer_step 0.057577335056695295 %forward: 23.122365992725598 %backward: 61.97645738923979 [2025-03-31 14:32:25,218] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24027.74 | forward: 145644.56 | backward_microstep: 390389.52 | backward: 390381.07 | backward_inner_microstep: 390364.60 | backward_inner: 390358.29 | backward_allreduce_microstep: 7.94 | backward_allreduce: 2.75 | reduce_tied_grads: 0.31 | comms: 18.14 | reduce_grads: 0.19 | step: 362.67 | _step_clipping: 0.12 | _step_step: 360.93 | _step_zero_grad: 0.50 | _step_check_overflow: 0.54 samples/sec: 16.257 | iteration 9150/ 143000 | elapsed time per iteration (ms): 62989.2 | learning rate: 5.957E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.549600E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 14:42:49,004] [INFO] [logging.py:60:log_dist] [Rank 0] step=9160, skipped=4, lr=[0.0005956889264443041, 0.0005956889264443041], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9160 loss: 2.5252 iter time (s): 62.378 samples/sec: 16.416 %comms: 0.0029061384930385526 %optimizer_step 0.0571504896836672 %forward: 23.30630762514658 %backward: 62.58795411389024 [2025-03-31 14:42:49,004] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18161.53 | forward: 145380.15 | backward_microstep: 390420.70 | backward: 390411.30 | backward_inner_microstep: 390395.33 | backward_inner: 390387.30 | backward_allreduce_microstep: 7.45 | backward_allreduce: 2.57 | reduce_tied_grads: 0.31 | comms: 18.13 | reduce_grads: 0.20 | step: 356.49 | _step_clipping: 0.13 | _step_step: 354.72 | _step_zero_grad: 0.47 | _step_check_overflow: 0.60 samples/sec: 16.416 | iteration 9160/ 143000 | elapsed time per iteration (ms): 62378.6 | learning rate: 5.957E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.535236E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 14:53:14,747] [INFO] [logging.py:60:log_dist] [Rank 0] step=9170, skipped=4, lr=[0.0005956777862035, 0.0005956777862035], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9170 loss: 2.5624 iter time (s): 62.574 samples/sec: 16.365 %comms: 0.0028865292661336087 %optimizer_step 0.05660901564715396 %forward: 23.27101548256944 %backward: 62.38628204700696 [2025-03-31 14:53:14,748] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19939.37 | forward: 145615.62 | backward_microstep: 390383.69 | backward: 390374.76 | backward_inner_microstep: 390358.77 | backward_inner: 390352.37 | backward_allreduce_microstep: 7.66 | backward_allreduce: 2.67 | reduce_tied_grads: 0.30 | comms: 18.06 | reduce_grads: 0.19 | step: 354.22 | _step_clipping: 0.11 | _step_step: 352.43 | _step_zero_grad: 0.49 | _step_check_overflow: 0.63 samples/sec: 16.365 | iteration 9170/ 143000 | elapsed time per iteration (ms): 62574.4 | learning rate: 5.957E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.549332E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 15:03:42,700] [INFO] [logging.py:60:log_dist] [Rank 0] step=9180, skipped=4, lr=[0.0005956666316919597, 0.0005956666316919597], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9180 loss: 2.5631 iter time (s): 62.795 samples/sec: 16.307 %comms: 0.002857467743447282 %optimizer_step 0.05725798116729805 %forward: 23.239130514102605 %backward: 62.18508530853006 [2025-03-31 15:03:42,701] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21675.77 | forward: 145929.42 | backward_microstep: 390500.99 | backward: 390489.36 | backward_inner_microstep: 390472.23 | backward_inner: 390465.70 | backward_allreduce_microstep: 8.21 | backward_allreduce: 2.84 | reduce_tied_grads: 0.28 | comms: 17.94 | reduce_grads: 0.20 | step: 359.55 | _step_clipping: 0.11 | _step_step: 357.84 | _step_zero_grad: 0.49 | _step_check_overflow: 0.56 samples/sec: 16.307 | iteration 9180/ 143000 | elapsed time per iteration (ms): 62795.3 | learning rate: 5.957E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.557682E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 15:14:09,155] [INFO] [logging.py:60:log_dist] [Rank 0] step=9190, skipped=4, lr=[0.0005956554629102223, 0.0005956554629102223], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9190 loss: 2.5528 iter time (s): 62.645 samples/sec: 16.346 %comms: 0.002914117328114301 %optimizer_step 0.06360424814725558 %forward: 23.272058965801822 %backward: 62.35260847529158 [2025-03-31 15:14:09,156] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20144.95 | forward: 145787.69 | backward_microstep: 390617.82 | backward: 390607.58 | backward_inner_microstep: 390590.62 | backward_inner: 390583.86 | backward_allreduce_microstep: 8.03 | backward_allreduce: 2.76 | reduce_tied_grads: 0.33 | comms: 18.26 | reduce_grads: 0.21 | step: 398.45 | _step_clipping: 0.14 | _step_step: 388.99 | _step_zero_grad: 0.53 | _step_check_overflow: 0.63 samples/sec: 16.346 | iteration 9190/ 143000 | elapsed time per iteration (ms): 62645.5 | learning rate: 5.957E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.559484E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 15:24:44,105] [INFO] [logging.py:60:log_dist] [Rank 0] step=9200, skipped=4, lr=[0.0005956442798588263, 0.0005956442798588263], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9200 loss: 2.5549 iter time (s): 63.494 samples/sec: 16.127 %comms: 0.0028449444735723474 %optimizer_step 0.05677666556513826 %forward: 22.943927115458212 %backward: 61.54477879724619 [2025-03-31 15:24:44,105] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28601.61 | forward: 145680.92 | backward_microstep: 390788.19 | backward: 390774.43 | backward_inner_microstep: 390757.02 | backward_inner: 390750.25 | backward_allreduce_microstep: 8.16 | backward_allreduce: 2.82 | reduce_tied_grads: 0.31 | comms: 18.06 | reduce_grads: 0.21 | step: 360.50 | _step_clipping: 0.15 | _step_step: 358.66 | _step_zero_grad: 0.52 | _step_check_overflow: 0.53 samples/sec: 16.127 | iteration 9200/ 143000 | elapsed time per iteration (ms): 63494.9 | learning rate: 5.956E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.559876E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 15:35:15,230] [INFO] [logging.py:60:log_dist] [Rank 0] step=9210, skipped=4, lr=[0.0005956330825383116, 0.0005956330825383116], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9210 loss: 2.5301 iter time (s): 63.112 samples/sec: 16.225 %comms: 0.0028631636514886228 %optimizer_step 0.05591946627948874 %forward: 23.042688859237984 %backward: 61.86149881878019 [2025-03-31 15:35:15,230] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25410.46 | forward: 145426.89 | backward_microstep: 390430.07 | backward: 390419.95 | backward_inner_microstep: 390402.93 | backward_inner: 390396.59 | backward_allreduce_microstep: 8.08 | backward_allreduce: 2.82 | reduce_tied_grads: 0.31 | comms: 18.07 | reduce_grads: 0.21 | step: 352.92 | _step_clipping: 0.13 | _step_step: 351.18 | _step_zero_grad: 0.53 | _step_check_overflow: 0.49 samples/sec: 16.225 | iteration 9210/ 143000 | elapsed time per iteration (ms): 63112.5 | learning rate: 5.956E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.540868E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 15:45:38,099] [INFO] [logging.py:60:log_dist] [Rank 0] step=9220, skipped=4, lr=[0.0005956218709492187, 0.0005956218709492187], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9220 loss: 2.5484 iter time (s): 62.286 samples/sec: 16.440 %comms: 0.002889398476680568 %optimizer_step 0.055790558242491996 %forward: 23.322704959412043 %backward: 62.65829290700843 [2025-03-31 15:45:38,100] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17491.22 | forward: 145268.76 | backward_microstep: 390284.89 | backward: 390276.03 | backward_inner_microstep: 390260.44 | backward_inner: 390254.34 | backward_allreduce_microstep: 7.35 | backward_allreduce: 2.54 | reduce_tied_grads: 0.29 | comms: 18.00 | reduce_grads: 0.18 | step: 347.50 | _step_clipping: 0.12 | _step_step: 345.83 | _step_zero_grad: 0.47 | _step_check_overflow: 0.53 samples/sec: 16.440 | iteration 9220/ 143000 | elapsed time per iteration (ms): 62287.0 | learning rate: 5.956E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.547764E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 15:55:58,989] [INFO] [logging.py:60:log_dist] [Rank 0] step=9230, skipped=4, lr=[0.0005956106450920889, 0.0005956106450920889], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9230 loss: 2.5200 iter time (s): 62.088 samples/sec: 16.493 %comms: 0.002871846033436003 %optimizer_step 0.055110812933235505 %forward: 23.383477715276186 %backward: 62.822123883157346 [2025-03-31 15:55:58,990] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15835.02 | forward: 145184.40 | backward_microstep: 390059.11 | backward: 390052.87 | backward_inner_microstep: 390038.05 | backward_inner: 390032.44 | backward_allreduce_microstep: 7.20 | backward_allreduce: 2.50 | reduce_tied_grads: 0.25 | comms: 17.83 | reduce_grads: 0.18 | step: 342.17 | _step_clipping: 0.11 | _step_step: 340.61 | _step_zero_grad: 0.45 | _step_check_overflow: 0.48 samples/sec: 16.492 | iteration 9230/ 143000 | elapsed time per iteration (ms): 62089.0 | learning rate: 5.956E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.543333E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 16:06:23,130] [INFO] [logging.py:60:log_dist] [Rank 0] step=9240, skipped=4, lr=[0.0005955994049674637, 0.0005955994049674637], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9240 loss: 2.5291 iter time (s): 62.413 samples/sec: 16.407 %comms: 0.0029088800820661126 %optimizer_step 0.05902832338618243 %forward: 23.306935345909487 %backward: 62.53066989888977 [2025-03-31 16:06:23,130] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18522.81 | forward: 145464.81 | backward_microstep: 390278.62 | backward: 390270.62 | backward_inner_microstep: 390253.00 | backward_inner: 390246.88 | backward_allreduce_microstep: 7.62 | backward_allreduce: 2.62 | reduce_tied_grads: 0.32 | comms: 18.16 | reduce_grads: 0.20 | step: 368.41 | _step_clipping: 0.15 | _step_step: 366.58 | _step_zero_grad: 0.51 | _step_check_overflow: 0.57 samples/sec: 16.407 | iteration 9240/ 143000 | elapsed time per iteration (ms): 62414.0 | learning rate: 5.956E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.541604E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 16:16:46,446] [INFO] [logging.py:60:log_dist] [Rank 0] step=9250, skipped=4, lr=[0.0005955881505758856, 0.0005955881505758856], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9250 loss: 2.5391 iter time (s): 62.331 samples/sec: 16.428 %comms: 0.002864261023423368 %optimizer_step 0.0554334838641439 %forward: 23.31561430645037 %backward: 62.60771173168232 [2025-03-31 16:16:46,447] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17903.25 | forward: 145328.84 | backward_microstep: 390249.33 | backward: 390240.89 | backward_inner_microstep: 390222.83 | backward_inner: 390216.67 | backward_allreduce_microstep: 7.96 | backward_allreduce: 2.74 | reduce_tied_grads: 0.27 | comms: 17.85 | reduce_grads: 0.19 | step: 345.52 | _step_clipping: 0.12 | _step_step: 343.90 | _step_zero_grad: 0.45 | _step_check_overflow: 0.52 samples/sec: 16.428 | iteration 9250/ 143000 | elapsed time per iteration (ms): 62331.6 | learning rate: 5.956E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.538487E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 16:27:13,021] [INFO] [logging.py:60:log_dist] [Rank 0] step=9260, skipped=4, lr=[0.0005955768819178981, 0.0005955768819178981], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9260 loss: 2.5281 iter time (s): 62.657 samples/sec: 16.343 %comms: 0.0028745204025207097 %optimizer_step 0.05588204952118256 %forward: 23.198934049920606 %backward: 62.28228241947317 [2025-03-31 16:27:13,021] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21094.05 | forward: 145357.34 | backward_microstep: 390250.11 | backward: 390241.50 | backward_inner_microstep: 390225.56 | backward_inner: 390219.42 | backward_allreduce_microstep: 7.54 | backward_allreduce: 2.60 | reduce_tied_grads: 0.36 | comms: 18.01 | reduce_grads: 0.19 | step: 350.14 | _step_clipping: 0.12 | _step_step: 348.48 | _step_zero_grad: 0.50 | _step_check_overflow: 0.47 samples/sec: 16.343 | iteration 9260/ 143000 | elapsed time per iteration (ms): 62657.5 | learning rate: 5.956E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.541273E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 16:37:39,983] [INFO] [logging.py:60:log_dist] [Rank 0] step=9270, skipped=4, lr=[0.0005955655989940447, 0.0005955655989940447], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9270 loss: 2.5574 iter time (s): 62.696 samples/sec: 16.333 %comms: 0.0028757480059302763 %optimizer_step 0.05690187385598872 %forward: 23.186216463675663 %backward: 62.2375088773537 [2025-03-31 16:37:39,984] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21547.14 | forward: 145367.50 | backward_microstep: 390209.89 | backward: 390202.12 | backward_inner_microstep: 390184.55 | backward_inner: 390178.43 | backward_allreduce_microstep: 9.37 | backward_allreduce: 2.59 | reduce_tied_grads: 0.32 | comms: 18.03 | reduce_grads: 0.21 | step: 356.75 | _step_clipping: 0.13 | _step_step: 355.06 | _step_zero_grad: 0.48 | _step_check_overflow: 0.49 samples/sec: 16.333 | iteration 9270/ 143000 | elapsed time per iteration (ms): 62696.2 | learning rate: 5.956E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.554597E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 16:48:06,645] [INFO] [logging.py:60:log_dist] [Rank 0] step=9280, skipped=4, lr=[0.0005955543018048701, 0.0005955543018048701], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9280 loss: 2.5524 iter time (s): 62.666 samples/sec: 16.341 %comms: 0.0028671203869555678 %optimizer_step 0.05720186523311453 %forward: 23.183470275854834 %backward: 62.251115301136984 [2025-03-31 16:48:06,645] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21441.16 | forward: 145280.64 | backward_microstep: 390108.06 | backward: 390100.43 | backward_inner_microstep: 390084.66 | backward_inner: 390078.80 | backward_allreduce_microstep: 7.69 | backward_allreduce: 2.56 | reduce_tied_grads: 0.31 | comms: 17.97 | reduce_grads: 0.20 | step: 358.46 | _step_clipping: 0.13 | _step_step: 356.73 | _step_zero_grad: 0.52 | _step_check_overflow: 0.50 samples/sec: 16.341 | iteration 9280/ 143000 | elapsed time per iteration (ms): 62666.2 | learning rate: 5.956E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.544019E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 16:58:29,517] [INFO] [logging.py:60:log_dist] [Rank 0] step=9290, skipped=4, lr=[0.0005955429903509198, 0.0005955429903509198], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9290 loss: 2.5613 iter time (s): 62.287 samples/sec: 16.440 %comms: 0.0034718596876588677 %optimizer_step 0.057418355191870045 %forward: 23.311402751697035 %backward: 62.630592969420206 [2025-03-31 16:58:29,517] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17736.89 | forward: 145198.86 | backward_microstep: 390112.27 | backward: 390104.82 | backward_inner_microstep: 390089.36 | backward_inner: 390083.45 | backward_allreduce_microstep: 7.43 | backward_allreduce: 2.56 | reduce_tied_grads: 0.27 | comms: 21.63 | reduce_grads: 0.19 | step: 357.64 | _step_clipping: 0.14 | _step_step: 355.89 | _step_zero_grad: 0.49 | _step_check_overflow: 0.56 samples/sec: 16.440 | iteration 9290/ 143000 | elapsed time per iteration (ms): 62287.2 | learning rate: 5.955E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.544919E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 17:08:55,250] [INFO] [logging.py:60:log_dist] [Rank 0] step=9300, skipped=4, lr=[0.0005955316646327393, 0.0005955316646327393], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9300 loss: 2.5405 iter time (s): 62.573 samples/sec: 16.365 %comms: 0.0028686301562901207 %optimizer_step 0.05634878636727152 %forward: 23.20869981245392 %backward: 62.35703546807825 [2025-03-31 17:08:55,251] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20467.98 | forward: 145223.31 | backward_microstep: 390195.29 | backward: 390185.38 | backward_inner_microstep: 390163.10 | backward_inner: 390155.38 | backward_allreduce_microstep: 12.47 | backward_allreduce: 4.26 | reduce_tied_grads: 0.31 | comms: 17.95 | reduce_grads: 0.20 | step: 352.59 | _step_clipping: 0.12 | _step_step: 350.84 | _step_zero_grad: 0.47 | _step_check_overflow: 0.60 samples/sec: 16.365 | iteration 9300/ 143000 | elapsed time per iteration (ms): 62573.3 | learning rate: 5.955E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.542938E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 17:19:25,545] [INFO] [logging.py:60:log_dist] [Rank 0] step=9310, skipped=4, lr=[0.0005955203246508755, 0.0005955203246508755], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9310 loss: 2.5549 iter time (s): 63.029 samples/sec: 16.247 %comms: 0.00284817218337356 %optimizer_step 0.05599386601418438 %forward: 23.106827335402794 %backward: 61.9087407465162 [2025-03-31 17:19:25,546] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24591.68 | forward: 145639.88 | backward_microstep: 390213.19 | backward: 390204.22 | backward_inner_microstep: 390188.57 | backward_inner: 390182.48 | backward_allreduce_microstep: 7.44 | backward_allreduce: 2.57 | reduce_tied_grads: 0.30 | comms: 17.95 | reduce_grads: 0.19 | step: 352.92 | _step_clipping: 0.15 | _step_step: 351.12 | _step_zero_grad: 0.47 | _step_check_overflow: 0.62 samples/sec: 16.246 | iteration 9310/ 143000 | elapsed time per iteration (ms): 63029.5 | learning rate: 5.955E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.550371E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 17:29:53,170] [INFO] [logging.py:60:log_dist] [Rank 0] step=9320, skipped=4, lr=[0.0005955089704058758, 0.0005955089704058758], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9320 loss: 2.5298 iter time (s): 62.762 samples/sec: 16.316 %comms: 0.002856414922333978 %optimizer_step 0.05652254535328606 %forward: 23.187553486267802 %backward: 62.17156948402693 [2025-03-31 17:29:53,170] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22036.35 | forward: 145529.54 | backward_microstep: 390208.21 | backward: 390200.70 | backward_inner_microstep: 390183.41 | backward_inner: 390177.50 | backward_allreduce_microstep: 7.50 | backward_allreduce: 2.57 | reduce_tied_grads: 0.29 | comms: 17.93 | reduce_grads: 0.18 | step: 354.75 | _step_clipping: 0.11 | _step_step: 352.96 | _step_zero_grad: 0.50 | _step_check_overflow: 0.61 samples/sec: 16.315 | iteration 9320/ 143000 | elapsed time per iteration (ms): 62762.5 | learning rate: 5.955E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.538447E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 17:40:26,658] [INFO] [logging.py:60:log_dist] [Rank 0] step=9330, skipped=4, lr=[0.000595497601898288, 0.000595497601898288], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9330 loss: 2.5436 iter time (s): 63.348 samples/sec: 16.165 %comms: 0.002826666343718014 %optimizer_step 0.05548711595601033 %forward: 22.982571770425604 %backward: 61.590563898324866 [2025-03-31 17:40:26,658] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27859.46 | forward: 145590.50 | backward_microstep: 390172.63 | backward: 390165.24 | backward_inner_microstep: 390147.94 | backward_inner: 390142.00 | backward_allreduce_microstep: 7.58 | backward_allreduce: 2.61 | reduce_tied_grads: 0.28 | comms: 17.91 | reduce_grads: 0.19 | step: 351.50 | _step_clipping: 0.13 | _step_step: 349.82 | _step_zero_grad: 0.47 | _step_check_overflow: 0.51 samples/sec: 16.164 | iteration 9330/ 143000 | elapsed time per iteration (ms): 63348.8 | learning rate: 5.955E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.535849E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 17:50:50,062] [INFO] [logging.py:60:log_dist] [Rank 0] step=9340, skipped=4, lr=[0.0005954862191286609, 0.0005954862191286609], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9340 loss: 2.5548 iter time (s): 62.340 samples/sec: 16.426 %comms: 0.00287311292927211 %optimizer_step 0.05668886939349634 %forward: 23.313790610974053 %backward: 62.586084404138866 [2025-03-31 17:50:50,063] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18034.53 | forward: 145337.94 | backward_microstep: 390168.34 | backward: 390161.03 | backward_inner_microstep: 390145.69 | backward_inner: 390139.58 | backward_allreduce_microstep: 7.33 | backward_allreduce: 2.53 | reduce_tied_grads: 0.28 | comms: 17.91 | reduce_grads: 0.18 | step: 353.40 | _step_clipping: 0.13 | _step_step: 351.79 | _step_zero_grad: 0.48 | _step_check_overflow: 0.46 samples/sec: 16.426 | iteration 9340/ 143000 | elapsed time per iteration (ms): 62340.4 | learning rate: 5.955E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.543214E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 18:01:17,092] [INFO] [logging.py:60:log_dist] [Rank 0] step=9350, skipped=4, lr=[0.0005954748220975439, 0.0005954748220975439], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9350 loss: 2.5347 iter time (s): 62.702 samples/sec: 16.331 %comms: 0.002885512735521322 %optimizer_step 0.05625656649779824 %forward: 23.197769648127032 %backward: 62.22247854916327 [2025-03-31 18:01:17,093] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21580.59 | forward: 145455.69 | backward_microstep: 390157.17 | backward: 390150.16 | backward_inner_microstep: 390134.63 | backward_inner: 390128.67 | backward_allreduce_microstep: 7.46 | backward_allreduce: 2.56 | reduce_tied_grads: 0.28 | comms: 18.09 | reduce_grads: 0.18 | step: 352.74 | _step_clipping: 0.13 | _step_step: 351.06 | _step_zero_grad: 0.50 | _step_check_overflow: 0.50 samples/sec: 16.331 | iteration 9350/ 143000 | elapsed time per iteration (ms): 62703.0 | learning rate: 5.955E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.541150E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 18:11:40,184] [INFO] [logging.py:60:log_dist] [Rank 0] step=9360, skipped=4, lr=[0.000595463410805487, 0.000595463410805487], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9360 loss: 2.5482 iter time (s): 62.309 samples/sec: 16.434 %comms: 0.0028940683592905215 %optimizer_step 0.05800013901048237 %forward: 23.307100050422697 %backward: 62.60852399534432 [2025-03-31 18:11:40,185] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17898.09 | forward: 145223.41 | backward_microstep: 390111.51 | backward: 390105.29 | backward_inner_microstep: 390088.65 | backward_inner: 390083.12 | backward_allreduce_microstep: 7.29 | backward_allreduce: 2.51 | reduce_tied_grads: 0.26 | comms: 18.03 | reduce_grads: 0.17 | step: 361.39 | _step_clipping: 0.13 | _step_step: 359.71 | _step_zero_grad: 0.47 | _step_check_overflow: 0.54 samples/sec: 16.434 | iteration 9360/ 143000 | elapsed time per iteration (ms): 62309.2 | learning rate: 5.955E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.540411E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 18:22:06,284] [INFO] [logging.py:60:log_dist] [Rank 0] step=9370, skipped=4, lr=[0.000595451985253041, 0.000595451985253041], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9370 loss: 2.5499 iter time (s): 62.609 samples/sec: 16.355 %comms: 0.0031411676044734607 %optimizer_step 0.0579862403424195 %forward: 23.208589198217975 %backward: 62.32634251316844 [2025-03-31 18:22:06,284] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20642.46 | forward: 145307.66 | backward_microstep: 390228.66 | backward: 390221.69 | backward_inner_microstep: 390204.50 | backward_inner: 390196.92 | backward_allreduce_microstep: 7.42 | backward_allreduce: 2.55 | reduce_tied_grads: 3.62 | comms: 19.67 | reduce_grads: 0.19 | step: 363.05 | _step_clipping: 0.11 | _step_step: 361.31 | _step_zero_grad: 0.50 | _step_check_overflow: 0.56 samples/sec: 16.355 | iteration 9370/ 143000 | elapsed time per iteration (ms): 62610.0 | learning rate: 5.955E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.537933E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 18:32:33,485] [INFO] [logging.py:60:log_dist] [Rank 0] step=9380, skipped=4, lr=[0.0005954405454407574, 0.0005954405454407574], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9380 loss: 2.5377 iter time (s): 62.720 samples/sec: 16.327 %comms: 0.002862375809030238 %optimizer_step 0.05710162050662427 %forward: 23.19586216391476 %backward: 62.22925253020362 [2025-03-31 18:32:33,485] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21509.10 | forward: 145483.31 | backward_microstep: 390307.15 | backward: 390298.82 | backward_inner_microstep: 390283.16 | backward_inner: 390277.11 | backward_allreduce_microstep: 7.47 | backward_allreduce: 2.55 | reduce_tied_grads: 0.30 | comms: 17.95 | reduce_grads: 0.18 | step: 358.14 | _step_clipping: 0.12 | _step_step: 356.50 | _step_zero_grad: 0.46 | _step_check_overflow: 0.49 samples/sec: 16.327 | iteration 9380/ 143000 | elapsed time per iteration (ms): 62720.1 | learning rate: 5.954E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.541424E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 18:43:01,304] [INFO] [logging.py:60:log_dist] [Rank 0] step=9390, skipped=4, lr=[0.0005954290913691882, 0.0005954290913691882], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9390 loss: 2.5424 iter time (s): 62.781 samples/sec: 16.311 %comms: 0.002847740546814118 %optimizer_step 0.054997036780285294 %forward: 23.155020729329966 %backward: 62.15217991065958 [2025-03-31 18:43:01,305] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22372.08 | forward: 145370.61 | backward_microstep: 390208.17 | backward: 390200.49 | backward_inner_microstep: 390182.89 | backward_inner: 390177.06 | backward_allreduce_microstep: 7.41 | backward_allreduce: 2.55 | reduce_tied_grads: 0.28 | comms: 17.88 | reduce_grads: 0.18 | step: 345.28 | _step_clipping: 0.12 | _step_step: 343.69 | _step_zero_grad: 0.46 | _step_check_overflow: 0.48 samples/sec: 16.310 | iteration 9390/ 143000 | elapsed time per iteration (ms): 62782.0 | learning rate: 5.954E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.539341E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 18:53:28,697] [INFO] [logging.py:60:log_dist] [Rank 0] step=9400, skipped=4, lr=[0.0005954176230388863, 0.0005954176230388863], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9400 loss: 2.5376 iter time (s): 62.739 samples/sec: 16.322 %comms: 0.0028670103263674224 %optimizer_step 0.055384670177412845 %forward: 23.198335316002993 %backward: 62.19722003059315 [2025-03-31 18:53:28,698] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21749.10 | forward: 145543.35 | backward_microstep: 390225.23 | backward: 390217.30 | backward_inner_microstep: 390200.38 | backward_inner: 390194.49 | backward_allreduce_microstep: 7.30 | backward_allreduce: 2.51 | reduce_tied_grads: 0.33 | comms: 17.99 | reduce_grads: 0.18 | step: 347.48 | _step_clipping: 0.11 | _step_step: 345.82 | _step_zero_grad: 0.46 | _step_check_overflow: 0.54 samples/sec: 16.322 | iteration 9400/ 143000 | elapsed time per iteration (ms): 62739.3 | learning rate: 5.954E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.537085E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 19:03:54,648] [INFO] [logging.py:60:log_dist] [Rank 0] step=9410, skipped=4, lr=[0.0005954061404504053, 0.0005954061404504053], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9410 loss: 2.5495 iter time (s): 62.595 samples/sec: 16.359 %comms: 0.0028944881555417004 %optimizer_step 0.05692831098400245 %forward: 23.20635916061565 %backward: 62.32210018990108 [2025-03-31 19:03:54,649] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20689.57 | forward: 145259.05 | backward_microstep: 390110.17 | backward: 390102.09 | backward_inner_microstep: 390085.09 | backward_inner: 390079.18 | backward_allreduce_microstep: 9.02 | backward_allreduce: 2.54 | reduce_tied_grads: 0.30 | comms: 18.12 | reduce_grads: 0.19 | step: 356.34 | _step_clipping: 0.14 | _step_step: 354.59 | _step_zero_grad: 0.48 | _step_check_overflow: 0.55 samples/sec: 16.359 | iteration 9410/ 143000 | elapsed time per iteration (ms): 62595.1 | learning rate: 5.954E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.557976E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 19:14:18,025] [INFO] [logging.py:60:log_dist] [Rank 0] step=9420, skipped=4, lr=[0.0005953946436042992, 0.0005953946436042992], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9420 loss: 2.5306 iter time (s): 62.337 samples/sec: 16.427 %comms: 0.0028722466764907004 %optimizer_step 0.056269155115892505 %forward: 23.316242933979826 %backward: 62.58099440746058 [2025-03-31 19:14:18,025] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18077.15 | forward: 145346.74 | backward_microstep: 390119.24 | backward: 390111.89 | backward_inner_microstep: 390096.69 | backward_inner: 390090.97 | backward_allreduce_microstep: 7.33 | backward_allreduce: 2.53 | reduce_tied_grads: 0.26 | comms: 17.90 | reduce_grads: 0.19 | step: 350.77 | _step_clipping: 0.15 | _step_step: 349.05 | _step_zero_grad: 0.46 | _step_check_overflow: 0.56 samples/sec: 16.427 | iteration 9420/ 143000 | elapsed time per iteration (ms): 62337.6 | learning rate: 5.954E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.546925E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 19:24:43,057] [INFO] [logging.py:60:log_dist] [Rank 0] step=9430, skipped=4, lr=[0.000595383132501123, 0.000595383132501123], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9430 loss: 2.5474 iter time (s): 62.503 samples/sec: 16.383 %comms: 0.0028694084596953555 %optimizer_step 0.0586750410787865 %forward: 23.27406527812125 %backward: 62.427166445299285 [2025-03-31 19:24:43,057] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19443.09 | forward: 145469.06 | backward_microstep: 390193.83 | backward: 390186.28 | backward_inner_microstep: 390170.41 | backward_inner: 390164.54 | backward_allreduce_microstep: 7.82 | backward_allreduce: 2.93 | reduce_tied_grads: 0.30 | comms: 17.93 | reduce_grads: 0.18 | step: 366.73 | _step_clipping: 0.12 | _step_step: 365.03 | _step_zero_grad: 0.47 | _step_check_overflow: 0.56 samples/sec: 16.383 | iteration 9430/ 143000 | elapsed time per iteration (ms): 62503.2 | learning rate: 5.954E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.533150E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 19:35:07,797] [INFO] [logging.py:60:log_dist] [Rank 0] step=9440, skipped=4, lr=[0.0005953716071414324, 0.0005953716071414324], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9440 loss: 2.5388 iter time (s): 62.474 samples/sec: 16.391 %comms: 0.002899941555804906 %optimizer_step 0.05828360455255409 %forward: 23.236197198117136 %backward: 62.43540189177871 [2025-03-31 19:35:07,798] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19631.21 | forward: 145164.67 | backward_microstep: 390062.05 | backward: 390055.83 | backward_inner_microstep: 390041.00 | backward_inner: 390031.93 | backward_allreduce_microstep: 7.21 | backward_allreduce: 2.47 | reduce_tied_grads: 0.27 | comms: 18.12 | reduce_grads: 0.19 | step: 364.12 | _step_clipping: 0.12 | _step_step: 362.37 | _step_zero_grad: 0.51 | _step_check_overflow: 0.55 samples/sec: 16.391 | iteration 9440/ 143000 | elapsed time per iteration (ms): 62474.1 | learning rate: 5.954E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.547912E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 19:45:36,435] [INFO] [logging.py:60:log_dist] [Rank 0] step=9450, skipped=4, lr=[0.0005953600675257836, 0.0005953600675257836], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9450 loss: 2.5464 iter time (s): 62.863 samples/sec: 16.289 %comms: 0.0028537108675453565 %optimizer_step 0.05605258863276631 %forward: 23.13883568932634 %backward: 62.08183778825129 [2025-03-31 19:45:36,435] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22998.22 | forward: 145458.04 | backward_microstep: 390274.28 | backward: 390266.07 | backward_inner_microstep: 390250.24 | backward_inner: 390240.90 | backward_allreduce_microstep: 7.61 | backward_allreduce: 2.62 | reduce_tied_grads: 0.28 | comms: 17.94 | reduce_grads: 0.20 | step: 352.36 | _step_clipping: 0.14 | _step_step: 350.68 | _step_zero_grad: 0.50 | _step_check_overflow: 0.48 samples/sec: 16.289 | iteration 9450/ 143000 | elapsed time per iteration (ms): 62863.7 | learning rate: 5.954E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.541784E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 19:56:02,065] [INFO] [logging.py:60:log_dist] [Rank 0] step=9460, skipped=4, lr=[0.0005953485136547334, 0.0005953485136547334], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9460 loss: 2.5415 iter time (s): 62.563 samples/sec: 16.368 %comms: 0.002863194124427493 %optimizer_step 0.055693950300535235 %forward: 23.213841626411575 %backward: 62.35162566927753 [2025-03-31 19:56:02,066] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20414.80 | forward: 145231.65 | backward_microstep: 390093.76 | backward: 390087.50 | backward_inner_microstep: 390071.78 | backward_inner: 390064.36 | backward_allreduce_microstep: 7.67 | backward_allreduce: 2.69 | reduce_tied_grads: 0.27 | comms: 17.91 | reduce_grads: 0.19 | step: 348.44 | _step_clipping: 0.12 | _step_step: 346.77 | _step_zero_grad: 0.48 | _step_check_overflow: 0.50 samples/sec: 16.367 | iteration 9460/ 143000 | elapsed time per iteration (ms): 62563.1 | learning rate: 5.953E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.538983E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 20:06:23,633] [INFO] [logging.py:60:log_dist] [Rank 0] step=9470, skipped=4, lr=[0.0005953369455288396, 0.0005953369455288396], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9470 loss: 2.5452 iter time (s): 62.156 samples/sec: 16.475 %comms: 0.002866105286296373 %optimizer_step 0.057569092502405814 %forward: 23.35846794385632 %backward: 62.74667856083994 [2025-03-31 20:06:23,634] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16488.64 | forward: 145187.48 | backward_microstep: 390016.00 | backward: 390009.84 | backward_inner_microstep: 389994.85 | backward_inner: 389989.34 | backward_allreduce_microstep: 7.18 | backward_allreduce: 2.48 | reduce_tied_grads: 0.24 | comms: 17.81 | reduce_grads: 0.18 | step: 357.83 | _step_clipping: 0.13 | _step_step: 356.29 | _step_zero_grad: 0.45 | _step_check_overflow: 0.43 samples/sec: 16.474 | iteration 9470/ 143000 | elapsed time per iteration (ms): 62156.8 | learning rate: 5.953E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.540131E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 20:16:42,583] [INFO] [logging.py:60:log_dist] [Rank 0] step=9480, skipped=4, lr=[0.0005953253631486604, 0.0005953253631486604], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9480 loss: 2.5485 iter time (s): 61.894 samples/sec: 16.544 %comms: 0.0028877449947096816 %optimizer_step 0.05644702301762958 %forward: 23.47516687896281 %backward: 63.029463101673365 [2025-03-31 20:16:42,583] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13600.10 | forward: 145298.15 | backward_microstep: 390123.76 | backward: 390117.11 | backward_inner_microstep: 390101.97 | backward_inner: 390096.31 | backward_allreduce_microstep: 7.36 | backward_allreduce: 2.57 | reduce_tied_grads: 0.26 | comms: 17.87 | reduce_grads: 0.18 | step: 349.38 | _step_clipping: 0.12 | _step_step: 347.73 | _step_zero_grad: 0.46 | _step_check_overflow: 0.53 samples/sec: 16.544 | iteration 9480/ 143000 | elapsed time per iteration (ms): 61895.0 | learning rate: 5.953E-04 | approx flops per GPU: 71.4TFLOPS | lm_loss: 2.541535E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 20:27:06,984] [INFO] [logging.py:60:log_dist] [Rank 0] step=9490, skipped=4, lr=[0.000595313766514755, 0.000595313766514755], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9490 loss: 2.5394 iter time (s): 62.440 samples/sec: 16.400 %comms: 0.0028747499712683464 %optimizer_step 0.05925665103420294 %forward: 23.27228028844829 %backward: 62.48159280070813 [2025-03-31 20:27:06,985] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19006.87 | forward: 145311.15 | backward_microstep: 390139.15 | backward: 390132.47 | backward_inner_microstep: 390117.11 | backward_inner: 390111.43 | backward_allreduce_microstep: 7.45 | backward_allreduce: 2.55 | reduce_tied_grads: 0.28 | comms: 17.95 | reduce_grads: 0.18 | step: 370.00 | _step_clipping: 0.12 | _step_step: 368.28 | _step_zero_grad: 0.49 | _step_check_overflow: 0.56 samples/sec: 16.400 | iteration 9490/ 143000 | elapsed time per iteration (ms): 62440.1 | learning rate: 5.953E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.543155E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 20:37:36,932] [INFO] [logging.py:60:log_dist] [Rank 0] step=9500, skipped=4, lr=[0.000595302155627683, 0.000595302155627683], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9500 loss: 2.5510 iter time (s): 62.994 samples/sec: 16.255 %comms: 0.00286033885847628 %optimizer_step 0.05608429050244403 %forward: 23.13941272142503 %backward: 61.959762907986196 [2025-03-31 20:37:36,933] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23909.06 | forward: 145764.95 | backward_microstep: 390318.99 | backward: 390310.75 | backward_inner_microstep: 390295.02 | backward_inner: 390287.21 | backward_allreduce_microstep: 7.51 | backward_allreduce: 2.58 | reduce_tied_grads: 0.30 | comms: 18.02 | reduce_grads: 0.19 | step: 353.30 | _step_clipping: 0.15 | _step_step: 351.56 | _step_zero_grad: 0.46 | _step_check_overflow: 0.59 samples/sec: 16.255 | iteration 9500/ 143000 | elapsed time per iteration (ms): 62994.8 | learning rate: 5.953E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.534540E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 20:48:01,443] [INFO] [logging.py:60:log_dist] [Rank 0] step=9510, skipped=4, lr=[0.0005952905304880047, 0.0005952905304880047], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9510 loss: 2.5341 iter time (s): 62.450 samples/sec: 16.397 %comms: 0.0028871514349421705 %optimizer_step 0.056334709633936546 %forward: 23.288548435838972 %backward: 62.50105815770344 [2025-03-31 20:48:01,443] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18768.45 | forward: 145438.15 | backward_microstep: 390330.65 | backward: 390322.23 | backward_inner_microstep: 390306.57 | backward_inner: 390300.44 | backward_allreduce_microstep: 7.44 | backward_allreduce: 2.56 | reduce_tied_grads: 0.33 | comms: 18.03 | reduce_grads: 0.20 | step: 351.81 | _step_clipping: 0.13 | _step_step: 350.10 | _step_zero_grad: 0.46 | _step_check_overflow: 0.56 samples/sec: 16.397 | iteration 9510/ 143000 | elapsed time per iteration (ms): 62451.0 | learning rate: 5.953E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.545557E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 20:58:26,310] [INFO] [logging.py:60:log_dist] [Rank 0] step=9520, skipped=4, lr=[0.0005952788910962813, 0.0005952788910962813], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9520 loss: 2.5467 iter time (s): 62.486 samples/sec: 16.388 %comms: 0.002851199689620247 %optimizer_step 0.05707306162422856 %forward: 23.243910746839465 %backward: 62.42323966798191 [2025-03-31 20:58:26,310] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19642.33 | forward: 145242.39 | backward_microstep: 390065.16 | backward: 390059.18 | backward_inner_microstep: 390037.96 | backward_inner: 390030.70 | backward_allreduce_microstep: 11.86 | backward_allreduce: 5.11 | reduce_tied_grads: 0.25 | comms: 17.82 | reduce_grads: 0.18 | step: 356.63 | _step_clipping: 0.13 | _step_step: 354.99 | _step_zero_grad: 0.46 | _step_check_overflow: 0.53 samples/sec: 16.387 | iteration 9520/ 143000 | elapsed time per iteration (ms): 62486.8 | learning rate: 5.953E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.548931E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 21:08:51,760] [INFO] [logging.py:60:log_dist] [Rank 0] step=9530, skipped=4, lr=[0.0005952672374530745, 0.0005952672374530745], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9530 loss: 2.5366 iter time (s): 62.544 samples/sec: 16.372 %comms: 0.0028684446727018763 %optimizer_step 0.056112805313353394 %forward: 23.265495815058802 %backward: 62.38777676710075 [2025-03-31 21:08:51,761] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19785.22 | forward: 145512.70 | backward_microstep: 390208.77 | backward: 390200.74 | backward_inner_microstep: 390184.80 | backward_inner: 390178.85 | backward_allreduce_microstep: 7.46 | backward_allreduce: 2.58 | reduce_tied_grads: 0.30 | comms: 17.94 | reduce_grads: 0.19 | step: 350.95 | _step_clipping: 0.13 | _step_step: 349.20 | _step_zero_grad: 0.47 | _step_check_overflow: 0.59 samples/sec: 16.372 | iteration 9530/ 143000 | elapsed time per iteration (ms): 62545.0 | learning rate: 5.953E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.536635E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 21:19:19,196] [INFO] [logging.py:60:log_dist] [Rank 0] step=9540, skipped=4, lr=[0.0005952555695589469, 0.0005952555695589469], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9540 loss: 2.5612 iter time (s): 62.743 samples/sec: 16.321 %comms: 0.003140860613903421 %optimizer_step 0.05606164121654865 %forward: 23.15621895719539 %backward: 62.17272113940668 [2025-03-31 21:19:19,197] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22110.02 | forward: 145289.24 | backward_microstep: 390097.41 | backward: 390090.79 | backward_inner_microstep: 390075.50 | backward_inner: 390069.73 | backward_allreduce_microstep: 7.39 | backward_allreduce: 2.56 | reduce_tied_grads: 0.25 | comms: 19.71 | reduce_grads: 0.18 | step: 351.75 | _step_clipping: 0.12 | _step_step: 350.09 | _step_zero_grad: 0.47 | _step_check_overflow: 0.54 samples/sec: 16.320 | iteration 9540/ 143000 | elapsed time per iteration (ms): 62743.6 | learning rate: 5.953E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.539237E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 21:29:45,093] [INFO] [logging.py:60:log_dist] [Rank 0] step=9550, skipped=4, lr=[0.0005952438874144614, 0.0005952438874144614], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9550 loss: 2.5445 iter time (s): 62.589 samples/sec: 16.361 %comms: 0.002859997649156594 %optimizer_step 0.05532758398407761 %forward: 23.203866708593644 %backward: 62.33858244977611 [2025-03-31 21:29:45,093] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20552.04 | forward: 145230.91 | backward_microstep: 390179.08 | backward: 390171.56 | backward_inner_microstep: 390156.36 | backward_inner: 390150.49 | backward_allreduce_microstep: 7.29 | backward_allreduce: 2.51 | reduce_tied_grads: 0.28 | comms: 17.90 | reduce_grads: 0.18 | step: 346.29 | _step_clipping: 0.11 | _step_step: 344.65 | _step_zero_grad: 0.48 | _step_check_overflow: 0.52 samples/sec: 16.361 | iteration 9550/ 143000 | elapsed time per iteration (ms): 62589.6 | learning rate: 5.952E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.535969E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 21:40:07,130] [INFO] [logging.py:60:log_dist] [Rank 0] step=9560, skipped=4, lr=[0.0005952321910201822, 0.0005952321910201822], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9560 loss: 2.5440 iter time (s): 62.203 samples/sec: 16.462 %comms: 0.0028811881476876315 %optimizer_step 0.05553590768195749 %forward: 23.33901932383837 %backward: 62.701736596121414 [2025-03-31 21:40:07,131] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16922.84 | forward: 145176.27 | backward_microstep: 390031.26 | backward: 390025.13 | backward_inner_microstep: 390008.07 | backward_inner: 390002.40 | backward_allreduce_microstep: 7.45 | backward_allreduce: 2.56 | reduce_tied_grads: 0.29 | comms: 17.92 | reduce_grads: 0.20 | step: 345.45 | _step_clipping: 0.11 | _step_step: 343.81 | _step_zero_grad: 0.47 | _step_check_overflow: 0.52 samples/sec: 16.462 | iteration 9560/ 143000 | elapsed time per iteration (ms): 62203.8 | learning rate: 5.952E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.542136E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 21:50:34,197] [INFO] [logging.py:60:log_dist] [Rank 0] step=9570, skipped=4, lr=[0.0005952204803766735, 0.0005952204803766735], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9570 loss: 2.5352 iter time (s): 62.706 samples/sec: 16.330 %comms: 0.0028692215465263293 %optimizer_step 0.0564831676950488 %forward: 23.159499355451164 %backward: 62.21894091363321 [2025-03-31 21:50:34,198] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21720.00 | forward: 145224.28 | backward_microstep: 390158.95 | backward: 390150.97 | backward_inner_microstep: 390134.54 | backward_inner: 390128.41 | backward_allreduce_microstep: 8.11 | backward_allreduce: 3.06 | reduce_tied_grads: 0.33 | comms: 17.99 | reduce_grads: 0.20 | step: 354.18 | _step_clipping: 0.12 | _step_step: 352.39 | _step_zero_grad: 0.49 | _step_check_overflow: 0.62 samples/sec: 16.330 | iteration 9570/ 143000 | elapsed time per iteration (ms): 62706.7 | learning rate: 5.952E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.532730E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 22:01:02,292] [INFO] [logging.py:60:log_dist] [Rank 0] step=9580, skipped=4, lr=[0.0005952087554845005, 0.0005952087554845005], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9580 loss: 2.5365 iter time (s): 62.809 samples/sec: 16.303 %comms: 0.0028517719704942274 %optimizer_step 0.0559203076306679 %forward: 23.174519910953123 %backward: 62.13335572570482 [2025-03-31 22:01:02,293] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22295.48 | forward: 145556.66 | backward_microstep: 390262.41 | backward: 390252.92 | backward_inner_microstep: 390237.15 | backward_inner: 390231.10 | backward_allreduce_microstep: 7.57 | backward_allreduce: 2.61 | reduce_tied_grads: 0.30 | comms: 17.91 | reduce_grads: 0.19 | step: 351.23 | _step_clipping: 0.13 | _step_step: 349.52 | _step_zero_grad: 0.46 | _step_check_overflow: 0.55 samples/sec: 16.303 | iteration 9580/ 143000 | elapsed time per iteration (ms): 62809.5 | learning rate: 5.952E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.538743E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 22:11:30,913] [INFO] [logging.py:60:log_dist] [Rank 0] step=9590, skipped=4, lr=[0.0005951970163442294, 0.0005951970163442294], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9590 loss: 2.5340 iter time (s): 62.861 samples/sec: 16.290 %comms: 0.002854544987732549 %optimizer_step 0.057035942661693956 %forward: 23.156692090960753 %backward: 62.08378058855566 [2025-03-31 22:11:30,913] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22781.01 | forward: 145566.44 | backward_microstep: 390275.96 | backward: 390267.95 | backward_inner_microstep: 390252.03 | backward_inner: 390246.00 | backward_allreduce_microstep: 7.62 | backward_allreduce: 2.62 | reduce_tied_grads: 0.30 | comms: 17.94 | reduce_grads: 0.19 | step: 358.54 | _step_clipping: 0.11 | _step_step: 356.93 | _step_zero_grad: 0.47 | _step_check_overflow: 0.48 samples/sec: 16.290 | iteration 9590/ 143000 | elapsed time per iteration (ms): 62862.1 | learning rate: 5.952E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.532650E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 22:21:54,341] [INFO] [logging.py:60:log_dist] [Rank 0] step=9600, skipped=4, lr=[0.0005951852629564265, 0.0005951852629564265], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9600 loss: 2.5327 iter time (s): 62.342 samples/sec: 16.425 %comms: 0.002871358195347049 %optimizer_step 0.0582383079689981 %forward: 23.330396965058256 %backward: 62.59083973572764 [2025-03-31 22:21:54,342] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17768.43 | forward: 145447.04 | backward_microstep: 390212.62 | backward: 390205.65 | backward_inner_microstep: 390188.08 | backward_inner: 390182.25 | backward_allreduce_microstep: 9.39 | backward_allreduce: 2.60 | reduce_tied_grads: 0.30 | comms: 17.90 | reduce_grads: 0.19 | step: 363.07 | _step_clipping: 0.12 | _step_step: 361.37 | _step_zero_grad: 0.50 | _step_check_overflow: 0.53 samples/sec: 16.425 | iteration 9600/ 143000 | elapsed time per iteration (ms): 62342.8 | learning rate: 5.952E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.533437E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 22:32:17,196] [INFO] [logging.py:60:log_dist] [Rank 0] step=9610, skipped=4, lr=[0.0005951734953216591, 0.0005951734953216591], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9610 loss: 2.5262 iter time (s): 62.285 samples/sec: 16.441 %comms: 0.0028688341302733626 %optimizer_step 0.05561636083847661 %forward: 23.306285982557156 %backward: 62.62952991750406 [2025-03-31 22:32:17,197] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17629.66 | forward: 145163.08 | backward_microstep: 390095.62 | backward: 390087.71 | backward_inner_microstep: 390071.08 | backward_inner: 390065.29 | backward_allreduce_microstep: 8.94 | backward_allreduce: 2.49 | reduce_tied_grads: 0.26 | comms: 17.87 | reduce_grads: 0.18 | step: 346.41 | _step_clipping: 0.10 | _step_step: 344.77 | _step_zero_grad: 0.45 | _step_check_overflow: 0.52 samples/sec: 16.440 | iteration 9610/ 143000 | elapsed time per iteration (ms): 62285.5 | learning rate: 5.952E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.532046E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 22:42:40,906] [INFO] [logging.py:60:log_dist] [Rank 0] step=9620, skipped=4, lr=[0.0005951617134404954, 0.0005951617134404954], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9620 loss: 2.5290 iter time (s): 62.371 samples/sec: 16.418 %comms: 0.002912184694945236 %optimizer_step 0.05639496139425622 %forward: 23.33060711515746 %backward: 62.56360607607178 [2025-03-31 22:42:40,907] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17979.60 | forward: 145514.17 | backward_microstep: 390219.76 | backward: 390212.36 | backward_inner_microstep: 390196.68 | backward_inner: 390190.63 | backward_allreduce_microstep: 7.53 | backward_allreduce: 2.58 | reduce_tied_grads: 0.31 | comms: 18.16 | reduce_grads: 0.19 | step: 351.74 | _step_clipping: 0.13 | _step_step: 349.97 | _step_zero_grad: 0.46 | _step_check_overflow: 0.61 samples/sec: 16.418 | iteration 9620/ 143000 | elapsed time per iteration (ms): 62371.1 | learning rate: 5.952E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.528186E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 22:53:01,721] [INFO] [logging.py:60:log_dist] [Rank 0] step=9630, skipped=4, lr=[0.0005951499173135036, 0.0005951499173135036], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9630 loss: 2.5205 iter time (s): 62.081 samples/sec: 16.495 %comms: 0.0028820286888896693 %optimizer_step 0.057614765941338325 %forward: 23.418160529789102 %backward: 62.86550737225235 [2025-03-31 22:53:01,721] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15115.85 | forward: 145381.97 | backward_microstep: 390282.74 | backward: 390274.52 | backward_inner_microstep: 390258.88 | backward_inner: 390252.76 | backward_allreduce_microstep: 7.34 | backward_allreduce: 2.53 | reduce_tied_grads: 0.28 | comms: 17.89 | reduce_grads: 0.18 | step: 357.68 | _step_clipping: 0.11 | _step_step: 356.00 | _step_zero_grad: 0.48 | _step_check_overflow: 0.53 samples/sec: 16.494 | iteration 9630/ 143000 | elapsed time per iteration (ms): 62081.4 | learning rate: 5.951E-04 | approx flops per GPU: 71.2TFLOPS | lm_loss: 2.527952E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 23:03:30,139] [INFO] [logging.py:60:log_dist] [Rank 0] step=9640, skipped=4, lr=[0.0005951381069412534, 0.0005951381069412534], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9640 loss: 2.5444 iter time (s): 62.841 samples/sec: 16.295 %comms: 0.0028657060332238794 %optimizer_step 0.05567087255158663 %forward: 23.12642805387051 %backward: 62.09652063735337 [2025-03-31 23:03:30,139] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22861.75 | forward: 145329.49 | backward_microstep: 390229.63 | backward: 390222.64 | backward_inner_microstep: 390205.28 | backward_inner: 390199.54 | backward_allreduce_microstep: 9.39 | backward_allreduce: 2.75 | reduce_tied_grads: 0.28 | comms: 18.01 | reduce_grads: 0.19 | step: 349.84 | _step_clipping: 0.12 | _step_step: 348.03 | _step_zero_grad: 0.65 | _step_check_overflow: 0.48 samples/sec: 16.295 | iteration 9640/ 143000 | elapsed time per iteration (ms): 62841.8 | learning rate: 5.951E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.534463E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 23:13:57,424] [INFO] [logging.py:60:log_dist] [Rank 0] step=9650, skipped=4, lr=[0.0005951262823243147, 0.0005951262823243147], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9650 loss: 2.5321 iter time (s): 62.728 samples/sec: 16.324 %comms: 0.002848911805598885 %optimizer_step 0.05548044320258892 %forward: 23.157218931740385 %backward: 62.191073945820804 [2025-03-31 23:13:57,425] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21928.24 | forward: 145260.69 | backward_microstep: 390119.17 | backward: 390112.40 | backward_inner_microstep: 390097.05 | backward_inner: 390091.34 | backward_allreduce_microstep: 7.36 | backward_allreduce: 2.52 | reduce_tied_grads: 0.26 | comms: 17.87 | reduce_grads: 0.18 | step: 348.02 | _step_clipping: 0.12 | _step_step: 346.34 | _step_zero_grad: 0.46 | _step_check_overflow: 0.57 samples/sec: 16.324 | iteration 9650/ 143000 | elapsed time per iteration (ms): 62728.6 | learning rate: 5.951E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.539148E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 23:24:24,055] [INFO] [logging.py:60:log_dist] [Rank 0] step=9660, skipped=4, lr=[0.0005951144434632583, 0.0005951144434632583], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9660 loss: 2.5259 iter time (s): 62.662 samples/sec: 16.342 %comms: 0.0028757108611448564 %optimizer_step 0.05576176136026669 %forward: 23.200697646595028 %backward: 62.261808759025136 [2025-03-31 23:24:24,056] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21119.92 | forward: 145381.30 | backward_microstep: 390154.79 | backward: 390147.87 | backward_inner_microstep: 390132.72 | backward_inner: 390126.94 | backward_allreduce_microstep: 7.32 | backward_allreduce: 2.51 | reduce_tied_grads: 0.26 | comms: 18.02 | reduce_grads: 0.18 | step: 349.42 | _step_clipping: 0.14 | _step_step: 347.65 | _step_zero_grad: 0.49 | _step_check_overflow: 0.58 samples/sec: 16.341 | iteration 9660/ 143000 | elapsed time per iteration (ms): 62663.1 | learning rate: 5.951E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.530449E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 23:34:52,258] [INFO] [logging.py:60:log_dist] [Rank 0] step=9670, skipped=4, lr=[0.0005951025903586555, 0.0005951025903586555], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9670 loss: 2.5271 iter time (s): 62.820 samples/sec: 16.301 %comms: 0.0028485849833232535 %optimizer_step 0.056825201877672336 %forward: 23.151648063537426 %backward: 62.11582677945998 [2025-03-31 23:34:52,259] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22574.67 | forward: 145438.12 | backward_microstep: 390217.32 | backward: 390210.18 | backward_inner_microstep: 390192.89 | backward_inner: 390187.22 | backward_allreduce_microstep: 7.53 | backward_allreduce: 2.52 | reduce_tied_grads: 0.27 | comms: 17.89 | reduce_grads: 0.18 | step: 356.97 | _step_clipping: 0.11 | _step_step: 355.34 | _step_zero_grad: 0.46 | _step_check_overflow: 0.53 samples/sec: 16.300 | iteration 9670/ 143000 | elapsed time per iteration (ms): 62820.3 | learning rate: 5.951E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.533396E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 23:45:24,149] [INFO] [logging.py:60:log_dist] [Rank 0] step=9680, skipped=4, lr=[0.0005950907230110784, 0.0005950907230110784], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9680 loss: 2.5492 iter time (s): 63.189 samples/sec: 16.205 %comms: 0.0028313938626213712 %optimizer_step 0.05535048437094869 %forward: 23.01387803281543 %backward: 61.75654307583468 [2025-03-31 23:45:24,150] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26235.66 | forward: 145421.37 | backward_microstep: 390239.36 | backward: 390230.67 | backward_inner_microstep: 390215.15 | backward_inner: 390209.31 | backward_allreduce_microstep: 7.53 | backward_allreduce: 2.57 | reduce_tied_grads: 0.28 | comms: 17.89 | reduce_grads: 0.18 | step: 349.75 | _step_clipping: 0.14 | _step_step: 348.07 | _step_zero_grad: 0.47 | _step_check_overflow: 0.54 samples/sec: 16.205 | iteration 9680/ 143000 | elapsed time per iteration (ms): 63189.1 | learning rate: 5.951E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.542559E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-03-31 23:55:50,947] [INFO] [logging.py:60:log_dist] [Rank 0] step=9690, skipped=4, lr=[0.0005950788414210998, 0.0005950788414210998], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9690 loss: 2.5263 iter time (s): 62.679 samples/sec: 16.337 %comms: 0.0028671839342842248 %optimizer_step 0.055388491806316206 %forward: 23.188614755356614 %backward: 62.2465243765991 [2025-03-31 23:55:50,947] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21289.56 | forward: 145344.36 | backward_microstep: 390163.15 | backward: 390156.19 | backward_inner_microstep: 390140.72 | backward_inner: 390134.59 | backward_allreduce_microstep: 7.49 | backward_allreduce: 2.62 | reduce_tied_grads: 0.28 | comms: 17.97 | reduce_grads: 0.19 | step: 347.17 | _step_clipping: 0.12 | _step_step: 345.46 | _step_zero_grad: 0.48 | _step_check_overflow: 0.56 samples/sec: 16.337 | iteration 9690/ 143000 | elapsed time per iteration (ms): 62679.7 | learning rate: 5.951E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.534377E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 00:06:22,359] [INFO] [logging.py:60:log_dist] [Rank 0] step=9700, skipped=4, lr=[0.0005950669455892932, 0.0005950669455892932], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9700 loss: 2.5112 iter time (s): 63.141 samples/sec: 16.218 %comms: 0.0028452466087526213 %optimizer_step 0.056028487017911084 %forward: 23.031794692330372 %backward: 61.80176807213724 [2025-04-01 00:06:22,360] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25728.92 | forward: 145424.30 | backward_microstep: 390227.91 | backward: 390220.52 | backward_inner_microstep: 390202.74 | backward_inner: 390196.67 | backward_allreduce_microstep: 9.49 | backward_allreduce: 4.44 | reduce_tied_grads: 0.31 | comms: 17.97 | reduce_grads: 0.20 | step: 353.77 | _step_clipping: 0.11 | _step_step: 352.16 | _step_zero_grad: 0.46 | _step_check_overflow: 0.47 samples/sec: 16.218 | iteration 9700/ 143000 | elapsed time per iteration (ms): 63141.2 | learning rate: 5.951E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.525143E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 00:16:48,597] [INFO] [logging.py:60:log_dist] [Rank 0] step=9710, skipped=4, lr=[0.0005950550355162325, 0.0005950550355162325], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9710 loss: 2.4939 iter time (s): 62.623 samples/sec: 16.352 %comms: 0.002866854738951906 %optimizer_step 0.055756842994205566 %forward: 23.198866510792527 %backward: 62.302288641764456 [2025-04-01 00:16:48,597] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20765.90 | forward: 145278.69 | backward_microstep: 390164.35 | backward: 390156.77 | backward_inner_microstep: 390141.43 | backward_inner: 390135.52 | backward_allreduce_microstep: 7.34 | backward_allreduce: 2.53 | reduce_tied_grads: 0.31 | comms: 17.95 | reduce_grads: 0.19 | step: 349.17 | _step_clipping: 0.13 | _step_step: 347.54 | _step_zero_grad: 0.45 | _step_check_overflow: 0.48 samples/sec: 16.352 | iteration 9710/ 143000 | elapsed time per iteration (ms): 62623.7 | learning rate: 5.951E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.522407E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 00:27:15,784] [INFO] [logging.py:60:log_dist] [Rank 0] step=9720, skipped=4, lr=[0.0005950431112024928, 0.0005950431112024928], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9720 loss: 2.5493 iter time (s): 62.718 samples/sec: 16.327 %comms: 0.0028454439500315153 %optimizer_step 0.056539912520389216 %forward: 23.163777136620272 %backward: 62.19897943742316 [2025-04-01 00:27:15,784] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21784.22 | forward: 145279.00 | backward_microstep: 390106.67 | backward: 390100.70 | backward_inner_microstep: 390085.58 | backward_inner: 390080.03 | backward_allreduce_microstep: 7.37 | backward_allreduce: 2.54 | reduce_tied_grads: 0.25 | comms: 17.85 | reduce_grads: 0.18 | step: 354.61 | _step_clipping: 0.13 | _step_step: 353.00 | _step_zero_grad: 0.46 | _step_check_overflow: 0.48 samples/sec: 16.327 | iteration 9720/ 143000 | elapsed time per iteration (ms): 62718.7 | learning rate: 5.950E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.537057E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 00:37:45,042] [INFO] [logging.py:60:log_dist] [Rank 0] step=9730, skipped=4, lr=[0.0005950311726486496, 0.0005950311726486496], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9730 loss: 2.5148 iter time (s): 62.925 samples/sec: 16.273 %comms: 0.00284111750273194 %optimizer_step 0.055488083829944454 %forward: 23.10921995276603 %backward: 62.01212363206643 [2025-04-01 00:37:45,042] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23564.29 | forward: 145415.46 | backward_microstep: 390220.54 | backward: 390213.15 | backward_inner_microstep: 390197.79 | backward_inner: 390192.00 | backward_allreduce_microstep: 7.40 | backward_allreduce: 2.56 | reduce_tied_grads: 0.29 | comms: 17.88 | reduce_grads: 0.18 | step: 349.16 | _step_clipping: 0.14 | _step_step: 347.52 | _step_zero_grad: 0.46 | _step_check_overflow: 0.50 samples/sec: 16.273 | iteration 9730/ 143000 | elapsed time per iteration (ms): 62925.8 | learning rate: 5.950E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.535907E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 00:48:06,725] [INFO] [logging.py:60:log_dist] [Rank 0] step=9740, skipped=4, lr=[0.000595019219855279, 0.000595019219855279], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9740 loss: 2.5335 iter time (s): 62.168 samples/sec: 16.472 %comms: 0.002875657538685806 %optimizer_step 0.05529716523165534 %forward: 23.35693409325691 %backward: 62.75982839395997 [2025-04-01 00:48:06,726] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16283.31 | forward: 145205.00 | backward_microstep: 390170.71 | backward: 390164.27 | backward_inner_microstep: 390149.10 | backward_inner: 390143.33 | backward_allreduce_microstep: 7.31 | backward_allreduce: 2.52 | reduce_tied_grads: 0.28 | comms: 17.88 | reduce_grads: 0.19 | step: 343.77 | _step_clipping: 0.12 | _step_step: 342.13 | _step_zero_grad: 0.46 | _step_check_overflow: 0.53 samples/sec: 16.471 | iteration 9740/ 143000 | elapsed time per iteration (ms): 62168.4 | learning rate: 5.950E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.524646E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 00:58:36,023] [INFO] [logging.py:60:log_dist] [Rank 0] step=9750, skipped=4, lr=[0.0005950072528229579, 0.0005950072528229579], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9750 loss: 2.5214 iter time (s): 62.929 samples/sec: 16.272 %comms: 0.0028459803300204746 %optimizer_step 0.05596251036386196 %forward: 23.11370589524927 %backward: 61.998678474462935 [2025-04-01 00:58:36,023] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23643.64 | forward: 145452.70 | backward_microstep: 390159.67 | backward: 390152.73 | backward_inner_microstep: 390135.77 | backward_inner: 390130.06 | backward_allreduce_microstep: 9.12 | backward_allreduce: 4.34 | reduce_tied_grads: 0.27 | comms: 17.91 | reduce_grads: 0.18 | step: 352.17 | _step_clipping: 0.11 | _step_step: 350.45 | _step_zero_grad: 0.50 | _step_check_overflow: 0.55 samples/sec: 16.272 | iteration 9750/ 143000 | elapsed time per iteration (ms): 62929.7 | learning rate: 5.950E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.529484E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 01:09:02,594] [INFO] [logging.py:60:log_dist] [Rank 0] step=9760, skipped=4, lr=[0.000594995271552264, 0.000594995271552264], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9760 loss: 2.5274 iter time (s): 62.657 samples/sec: 16.343 %comms: 0.0028603408844829646 %optimizer_step 0.0553849818393563 %forward: 23.17624295153283 %backward: 62.265311594156614 [2025-04-01 01:09:02,595] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21193.09 | forward: 145214.47 | backward_microstep: 390141.48 | backward: 390133.30 | backward_inner_microstep: 390118.26 | backward_inner: 390112.39 | backward_allreduce_microstep: 7.22 | backward_allreduce: 2.47 | reduce_tied_grads: 0.27 | comms: 17.92 | reduce_grads: 0.18 | step: 347.02 | _step_clipping: 0.12 | _step_step: 345.16 | _step_zero_grad: 0.49 | _step_check_overflow: 0.71 samples/sec: 16.343 | iteration 9760/ 143000 | elapsed time per iteration (ms): 62657.1 | learning rate: 5.950E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.523697E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 01:19:29,000] [INFO] [logging.py:60:log_dist] [Rank 0] step=9770, skipped=4, lr=[0.0005949832760437754, 0.0005949832760437754], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9770 loss: 2.5153 iter time (s): 62.640 samples/sec: 16.347 %comms: 0.002851162251747852 %optimizer_step 0.05558009844459711 %forward: 23.18496632100606 %backward: 62.27915012867401 [2025-04-01 01:19:29,001] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21004.29 | forward: 145230.77 | backward_microstep: 390123.37 | backward: 390116.97 | backward_inner_microstep: 390101.73 | backward_inner: 390095.96 | backward_allreduce_microstep: 7.35 | backward_allreduce: 2.53 | reduce_tied_grads: 0.26 | comms: 17.86 | reduce_grads: 0.18 | step: 348.15 | _step_clipping: 0.13 | _step_step: 346.49 | _step_zero_grad: 0.48 | _step_check_overflow: 0.53 samples/sec: 16.347 | iteration 9770/ 143000 | elapsed time per iteration (ms): 62640.6 | learning rate: 5.950E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.522826E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 01:29:56,644] [INFO] [logging.py:60:log_dist] [Rank 0] step=9780, skipped=4, lr=[0.0005949712662980713, 0.0005949712662980713], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9780 loss: 2.5418 iter time (s): 62.764 samples/sec: 16.315 %comms: 0.0028421218033370913 %optimizer_step 0.055466256370524346 %forward: 23.134267377758686 %backward: 62.13795122634238 [2025-04-01 01:29:56,645] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22443.84 | forward: 145199.48 | backward_microstep: 390007.45 | backward: 390001.46 | backward_inner_microstep: 389986.48 | backward_inner: 389980.95 | backward_allreduce_microstep: 7.32 | backward_allreduce: 2.58 | reduce_tied_grads: 0.24 | comms: 17.84 | reduce_grads: 0.18 | step: 348.13 | _step_clipping: 0.12 | _step_step: 346.46 | _step_zero_grad: 0.49 | _step_check_overflow: 0.53 samples/sec: 16.315 | iteration 9780/ 143000 | elapsed time per iteration (ms): 62764.4 | learning rate: 5.950E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.541322E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 01:40:23,316] [INFO] [logging.py:60:log_dist] [Rank 0] step=9790, skipped=4, lr=[0.0005949592423157311, 0.0005949592423157311], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9790 loss: 2.5544 iter time (s): 62.667 samples/sec: 16.340 %comms: 0.00285265542134935 %optimizer_step 0.055832911512162825 %forward: 23.169122721059697 %backward: 62.25424567577386 [2025-04-01 01:40:23,316] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21318.39 | forward: 145193.04 | backward_microstep: 390132.75 | backward: 390126.27 | backward_inner_microstep: 390110.97 | backward_inner: 390105.25 | backward_allreduce_microstep: 7.40 | backward_allreduce: 2.53 | reduce_tied_grads: 0.27 | comms: 17.88 | reduce_grads: 0.18 | step: 349.89 | _step_clipping: 0.12 | _step_step: 348.28 | _step_zero_grad: 0.47 | _step_check_overflow: 0.48 samples/sec: 16.340 | iteration 9790/ 143000 | elapsed time per iteration (ms): 62667.2 | learning rate: 5.950E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.537227E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 01:50:49,112] [INFO] [logging.py:60:log_dist] [Rank 0] step=9800, skipped=4, lr=[0.0005949472040973352, 0.0005949472040973352], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9800 loss: 2.5339 iter time (s): 62.579 samples/sec: 16.363 %comms: 0.002861786173719399 %optimizer_step 0.05555675050108518 %forward: 23.210139489268165 %backward: 62.33941575407152 [2025-04-01 01:50:49,113] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20386.68 | forward: 145247.06 | backward_microstep: 390121.61 | backward: 390114.70 | backward_inner_microstep: 390099.59 | backward_inner: 390093.85 | backward_allreduce_microstep: 7.29 | backward_allreduce: 2.51 | reduce_tied_grads: 0.28 | comms: 17.91 | reduce_grads: 0.18 | step: 347.67 | _step_clipping: 0.13 | _step_step: 346.06 | _step_zero_grad: 0.46 | _step_check_overflow: 0.47 samples/sec: 16.363 | iteration 9800/ 143000 | elapsed time per iteration (ms): 62579.7 | learning rate: 5.949E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.533326E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 02:01:06,886] [INFO] [logging.py:60:log_dist] [Rank 0] step=9810, skipped=4, lr=[0.0005949351516434648, 0.0005949351516434648], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9810 loss: 2.5462 iter time (s): 61.777 samples/sec: 16.576 %comms: 0.002888568581421384 %optimizer_step 0.0557679834037853 %forward: 23.494797385643434 %backward: 63.13198014265365 [2025-04-01 02:01:06,887] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 12590.29 | forward: 145143.54 | backward_microstep: 390015.70 | backward: 390009.71 | backward_inner_microstep: 389994.81 | backward_inner: 389989.20 | backward_allreduce_microstep: 7.22 | backward_allreduce: 2.48 | reduce_tied_grads: 0.26 | comms: 17.84 | reduce_grads: 0.18 | step: 344.52 | _step_clipping: 0.11 | _step_step: 342.90 | _step_zero_grad: 0.45 | _step_check_overflow: 0.52 samples/sec: 16.576 | iteration 9810/ 143000 | elapsed time per iteration (ms): 61777.4 | learning rate: 5.949E-04 | approx flops per GPU: 71.5TFLOPS | lm_loss: 2.534583E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 02:11:37,529] [INFO] [logging.py:60:log_dist] [Rank 0] step=9820, skipped=4, lr=[0.0005949230849547013, 0.0005949230849547013], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9820 loss: 2.5365 iter time (s): 63.064 samples/sec: 16.238 %comms: 0.002872044248221088 %optimizer_step 0.05729412217256737 %forward: 23.09244358096137 %backward: 61.87726269606236 [2025-04-01 02:11:37,530] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24730.55 | forward: 145629.57 | backward_microstep: 390229.64 | backward: 390221.11 | backward_inner_microstep: 390205.42 | backward_inner: 390199.40 | backward_allreduce_microstep: 7.50 | backward_allreduce: 2.58 | reduce_tied_grads: 0.28 | comms: 18.11 | reduce_grads: 0.20 | step: 361.32 | _step_clipping: 0.12 | _step_step: 359.57 | _step_zero_grad: 0.49 | _step_check_overflow: 0.56 samples/sec: 16.237 | iteration 9820/ 143000 | elapsed time per iteration (ms): 63064.3 | learning rate: 5.949E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.538072E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 02:22:07,335] [INFO] [logging.py:60:log_dist] [Rank 0] step=9830, skipped=4, lr=[0.0005949110040316273, 0.0005949110040316273], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9830 loss: 2.5148 iter time (s): 62.980 samples/sec: 16.259 %comms: 0.0028476221908948786 %optimizer_step 0.05551102957019725 %forward: 23.10295330224224 %backward: 61.94390652565643 [2025-04-01 02:22:07,337] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24120.96 | forward: 145502.38 | backward_microstep: 390130.04 | backward: 390122.68 | backward_inner_microstep: 390107.07 | backward_inner: 390099.07 | backward_allreduce_microstep: 7.60 | backward_allreduce: 2.74 | reduce_tied_grads: 0.28 | comms: 17.93 | reduce_grads: 0.18 | step: 349.61 | _step_clipping: 0.15 | _step_step: 347.93 | _step_zero_grad: 0.46 | _step_check_overflow: 0.52 samples/sec: 16.259 | iteration 9830/ 143000 | elapsed time per iteration (ms): 62980.7 | learning rate: 5.949E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.527529E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 02:32:36,943] [INFO] [logging.py:60:log_dist] [Rank 0] step=9840, skipped=4, lr=[0.0005948989088748257, 0.0005948989088748257], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9840 loss: 2.5277 iter time (s): 62.960 samples/sec: 16.264 %comms: 0.002831102214153041 %optimizer_step 0.05527848062026221 %forward: 23.06521853835676 %backward: 61.954372401545555 [2025-04-01 02:32:36,944] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24293.42 | forward: 145218.86 | backward_microstep: 390071.95 | backward: 390065.39 | backward_inner_microstep: 390050.25 | backward_inner: 390044.56 | backward_allreduce_microstep: 7.33 | backward_allreduce: 2.52 | reduce_tied_grads: 0.25 | comms: 17.82 | reduce_grads: 0.18 | step: 348.03 | _step_clipping: 0.12 | _step_step: 346.44 | _step_zero_grad: 0.47 | _step_check_overflow: 0.47 samples/sec: 16.264 | iteration 9840/ 143000 | elapsed time per iteration (ms): 62960.6 | learning rate: 5.949E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.524229E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 02:42:59,599] [INFO] [logging.py:60:log_dist] [Rank 0] step=9850, skipped=4, lr=[0.0005948867994848806, 0.0005948867994848806], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9850 loss: 2.5190 iter time (s): 62.265 samples/sec: 16.446 %comms: 0.0028998103130577677 %optimizer_step 0.05622414037693357 %forward: 23.31360475563224 %backward: 62.64404682135869 [2025-04-01 02:42:59,599] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17360.99 | forward: 145162.23 | backward_microstep: 390060.80 | backward: 390053.34 | backward_inner_microstep: 390037.94 | backward_inner: 390031.99 | backward_allreduce_microstep: 7.38 | backward_allreduce: 2.53 | reduce_tied_grads: 0.29 | comms: 18.06 | reduce_grads: 0.19 | step: 350.08 | _step_clipping: 0.12 | _step_step: 348.39 | _step_zero_grad: 0.46 | _step_check_overflow: 0.56 samples/sec: 16.446 | iteration 9850/ 143000 | elapsed time per iteration (ms): 62265.6 | learning rate: 5.949E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.530378E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 02:53:26,039] [INFO] [logging.py:60:log_dist] [Rank 0] step=9860, skipped=4, lr=[0.0005948746758623762, 0.0005948746758623762], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9860 loss: 2.5270 iter time (s): 62.644 samples/sec: 16.346 %comms: 0.0028519555882761264 %optimizer_step 0.05519523992650134 %forward: 23.176746645915983 %backward: 62.26483999318475 [2025-04-01 02:53:26,040] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21145.72 | forward: 145187.33 | backward_microstep: 390057.16 | backward: 390048.96 | backward_inner_microstep: 390029.38 | backward_inner: 390023.64 | backward_allreduce_microstep: 11.54 | backward_allreduce: 4.87 | reduce_tied_grads: 0.26 | comms: 17.87 | reduce_grads: 0.17 | step: 345.76 | _step_clipping: 0.13 | _step_step: 344.04 | _step_zero_grad: 0.46 | _step_check_overflow: 0.60 samples/sec: 16.346 | iteration 9860/ 143000 | elapsed time per iteration (ms): 62644.1 | learning rate: 5.949E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.523806E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 03:03:58,222] [INFO] [logging.py:60:log_dist] [Rank 0] step=9870, skipped=4, lr=[0.0005948625380078976, 0.0005948625380078976], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9870 loss: 2.5258 iter time (s): 63.218 samples/sec: 16.198 %comms: 0.002833367132497748 %optimizer_step 0.05560087001960713 %forward: 22.99723929993958 %backward: 61.70356568437288 [2025-04-01 03:03:58,223] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26661.78 | forward: 145383.38 | backward_microstep: 390083.28 | backward: 390076.09 | backward_inner_microstep: 390061.09 | backward_inner: 390055.33 | backward_allreduce_microstep: 7.19 | backward_allreduce: 2.47 | reduce_tied_grads: 0.28 | comms: 17.91 | reduce_grads: 0.18 | step: 351.50 | _step_clipping: 0.11 | _step_step: 349.81 | _step_zero_grad: 0.47 | _step_check_overflow: 0.57 samples/sec: 16.198 | iteration 9870/ 143000 | elapsed time per iteration (ms): 63218.3 | learning rate: 5.949E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.518040E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 03:14:27,201] [INFO] [logging.py:60:log_dist] [Rank 0] step=9880, skipped=4, lr=[0.0005948503859220308, 0.0005948503859220308], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9880 loss: 2.5335 iter time (s): 62.897 samples/sec: 16.281 %comms: 0.002840980919599831 %optimizer_step 0.05516767373068263 %forward: 23.127297393565392 %backward: 62.021612713447084 [2025-04-01 03:14:27,202] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23373.42 | forward: 145464.40 | backward_microstep: 390106.05 | backward: 390099.04 | backward_inner_microstep: 390083.48 | backward_inner: 390077.59 | backward_allreduce_microstep: 7.45 | backward_allreduce: 2.57 | reduce_tied_grads: 0.27 | comms: 17.87 | reduce_grads: 0.19 | step: 346.99 | _step_clipping: 0.14 | _step_step: 345.31 | _step_zero_grad: 0.51 | _step_check_overflow: 0.48 samples/sec: 16.280 | iteration 9880/ 143000 | elapsed time per iteration (ms): 62897.9 | learning rate: 5.949E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.538180E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 03:24:51,110] [INFO] [logging.py:60:log_dist] [Rank 0] step=9890, skipped=4, lr=[0.0005948382196053621, 0.0005948382196053621], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9890 loss: 2.5390 iter time (s): 62.390 samples/sec: 16.413 %comms: 0.0028737721316848233 %optimizer_step 0.0581126999415869 %forward: 23.28478585309993 %backward: 62.531945660885924 [2025-04-01 03:24:51,110] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18445.07 | forward: 145274.51 | backward_microstep: 390145.68 | backward: 390138.76 | backward_inner_microstep: 390123.11 | backward_inner: 390117.39 | backward_allreduce_microstep: 7.67 | backward_allreduce: 2.59 | reduce_tied_grads: 0.30 | comms: 17.93 | reduce_grads: 0.19 | step: 362.57 | _step_clipping: 0.14 | _step_step: 360.74 | _step_zero_grad: 0.53 | _step_check_overflow: 0.59 samples/sec: 16.413 | iteration 9890/ 143000 | elapsed time per iteration (ms): 62390.9 | learning rate: 5.948E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.537043E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 03:35:22,512] [INFO] [logging.py:60:log_dist] [Rank 0] step=9900, skipped=4, lr=[0.0005948260390584789, 0.0005948260390584789], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9900 loss: 2.5351 iter time (s): 63.140 samples/sec: 16.218 %comms: 0.002833245278193234 %optimizer_step 0.055230459972277025 %forward: 23.010535367361122 %backward: 61.78682490716633 [2025-04-01 03:35:22,513] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25933.25 | forward: 145287.81 | backward_microstep: 390126.84 | backward: 390120.10 | backward_inner_microstep: 390104.75 | backward_inner: 390098.99 | backward_allreduce_microstep: 7.39 | backward_allreduce: 2.54 | reduce_tied_grads: 0.28 | comms: 17.89 | reduce_grads: 0.18 | step: 348.72 | _step_clipping: 0.14 | _step_step: 347.03 | _step_zero_grad: 0.48 | _step_check_overflow: 0.52 samples/sec: 16.218 | iteration 9900/ 143000 | elapsed time per iteration (ms): 63140.2 | learning rate: 5.948E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.535427E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 03:45:51,614] [INFO] [logging.py:60:log_dist] [Rank 0] step=9910, skipped=4, lr=[0.000594813844281969, 0.000594813844281969], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9910 loss: 2.5355 iter time (s): 62.910 samples/sec: 16.277 %comms: 0.002849862700719555 %optimizer_step 0.0559772125953203 %forward: 23.14715663175879 %backward: 62.018783228773 [2025-04-01 03:45:51,614] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23226.86 | forward: 145617.75 | backward_microstep: 390165.17 | backward: 390157.46 | backward_inner_microstep: 390141.77 | backward_inner: 390135.67 | backward_allreduce_microstep: 7.52 | backward_allreduce: 2.60 | reduce_tied_grads: 0.29 | comms: 17.93 | reduce_grads: 0.18 | step: 352.15 | _step_clipping: 0.14 | _step_step: 350.46 | _step_zero_grad: 0.46 | _step_check_overflow: 0.54 samples/sec: 16.277 | iteration 9910/ 143000 | elapsed time per iteration (ms): 62910.1 | learning rate: 5.948E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.529143E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 03:56:19,479] [INFO] [logging.py:60:log_dist] [Rank 0] step=9920, skipped=4, lr=[0.000594801635276421, 0.000594801635276421], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9920 loss: 2.5295 iter time (s): 62.786 samples/sec: 16.309 %comms: 0.002852626479371743 %optimizer_step 0.05652154963401169 %forward: 23.1491197387523 %backward: 62.133766003985656 [2025-04-01 03:56:19,479] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22325.43 | forward: 145343.90 | backward_microstep: 390119.44 | backward: 390112.63 | backward_inner_microstep: 390097.22 | backward_inner: 390091.36 | backward_allreduce_microstep: 7.50 | backward_allreduce: 2.50 | reduce_tied_grads: 0.28 | comms: 17.91 | reduce_grads: 0.18 | step: 354.88 | _step_clipping: 0.12 | _step_step: 353.16 | _step_zero_grad: 0.49 | _step_check_overflow: 0.55 samples/sec: 16.309 | iteration 9920/ 143000 | elapsed time per iteration (ms): 62786.5 | learning rate: 5.948E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.535210E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 04:06:46,945] [INFO] [logging.py:60:log_dist] [Rank 0] step=9930, skipped=4, lr=[0.0005947894120424241, 0.0005947894120424241], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9930 loss: 2.5301 iter time (s): 62.746 samples/sec: 16.320 %comms: 0.0028422039509106265 %optimizer_step 0.05541982326029832 %forward: 23.15823652686617 %backward: 62.1720160286259 [2025-04-01 04:06:46,945] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21976.86 | forward: 145308.81 | backward_microstep: 390111.82 | backward: 390104.90 | backward_inner_microstep: 390089.69 | backward_inner: 390083.87 | backward_allreduce_microstep: 7.35 | backward_allreduce: 2.51 | reduce_tied_grads: 0.27 | comms: 17.83 | reduce_grads: 0.19 | step: 347.74 | _step_clipping: 0.12 | _step_step: 346.15 | _step_zero_grad: 0.46 | _step_check_overflow: 0.48 samples/sec: 16.320 | iteration 9930/ 143000 | elapsed time per iteration (ms): 62746.6 | learning rate: 5.948E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.528174E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 04:17:13,393] [INFO] [logging.py:60:log_dist] [Rank 0] step=9940, skipped=4, lr=[0.0005947771745805683, 0.0005947771745805683], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9940 loss: 2.5339 iter time (s): 62.644 samples/sec: 16.346 %comms: 0.002869352447008273 %optimizer_step 0.057244479512819156 %forward: 23.182595206821713 %backward: 62.27123139278344 [2025-04-01 04:17:13,394] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21045.95 | forward: 145225.71 | backward_microstep: 390100.52 | backward: 390093.68 | backward_inner_microstep: 390078.70 | backward_inner: 390073.02 | backward_allreduce_microstep: 7.20 | backward_allreduce: 2.47 | reduce_tied_grads: 0.30 | comms: 17.97 | reduce_grads: 0.19 | step: 358.60 | _step_clipping: 0.15 | _step_step: 356.82 | _step_zero_grad: 0.48 | _step_check_overflow: 0.57 samples/sec: 16.346 | iteration 9940/ 143000 | elapsed time per iteration (ms): 62644.8 | learning rate: 5.948E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.533667E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 04:27:41,755] [INFO] [logging.py:60:log_dist] [Rank 0] step=9950, skipped=4, lr=[0.0005947649228914444, 0.0005947649228914444], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9950 loss: 2.5202 iter time (s): 62.836 samples/sec: 16.296 %comms: 0.0028520764199805497 %optimizer_step 0.05577403584557199 %forward: 23.1294535267492 %backward: 62.09211518091594 [2025-04-01 04:27:41,756] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22775.17 | forward: 145335.44 | backward_microstep: 390167.31 | backward: 390159.88 | backward_inner_microstep: 390144.47 | backward_inner: 390138.57 | backward_allreduce_microstep: 7.35 | backward_allreduce: 2.53 | reduce_tied_grads: 0.26 | comms: 17.92 | reduce_grads: 0.18 | step: 350.46 | _step_clipping: 0.12 | _step_step: 348.79 | _step_zero_grad: 0.49 | _step_check_overflow: 0.51 samples/sec: 16.296 | iteration 9950/ 143000 | elapsed time per iteration (ms): 62836.2 | learning rate: 5.948E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.518060E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 04:38:08,397] [INFO] [logging.py:60:log_dist] [Rank 0] step=9960, skipped=4, lr=[0.0005947526569756434, 0.0005947526569756434], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9960 loss: 2.5130 iter time (s): 62.664 samples/sec: 16.341 %comms: 0.0028507358440686107 %optimizer_step 0.056354557288835115 %forward: 23.191728899083163 %backward: 62.23465596473204 [2025-04-01 04:38:08,398] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21269.17 | forward: 145327.83 | backward_microstep: 389991.38 | backward: 389985.05 | backward_inner_microstep: 389970.01 | backward_inner: 389964.35 | backward_allreduce_microstep: 7.27 | backward_allreduce: 2.50 | reduce_tied_grads: 0.28 | comms: 17.86 | reduce_grads: 0.18 | step: 353.14 | _step_clipping: 0.15 | _step_step: 351.44 | _step_zero_grad: 0.46 | _step_check_overflow: 0.54 samples/sec: 16.341 | iteration 9960/ 143000 | elapsed time per iteration (ms): 62664.2 | learning rate: 5.948E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.519051E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 04:48:34,441] [INFO] [logging.py:60:log_dist] [Rank 0] step=9970, skipped=4, lr=[0.0005947403768337575, 0.0005947403768337575], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9970 loss: 2.5394 iter time (s): 62.604 samples/sec: 16.357 %comms: 0.0029093632524185616 %optimizer_step 0.056682198737724045 %forward: 23.195467652147528 %backward: 62.300279566676544 [2025-04-01 04:48:34,442] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20737.09 | forward: 145212.68 | backward_microstep: 390030.59 | backward: 390024.05 | backward_inner_microstep: 390008.89 | backward_inner: 390003.07 | backward_allreduce_microstep: 7.29 | backward_allreduce: 2.49 | reduce_tied_grads: 0.30 | comms: 18.21 | reduce_grads: 0.19 | step: 354.85 | _step_clipping: 0.12 | _step_step: 353.10 | _step_zero_grad: 0.49 | _step_check_overflow: 0.58 samples/sec: 16.357 | iteration 9970/ 143000 | elapsed time per iteration (ms): 62604.5 | learning rate: 5.947E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.519223E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 04:59:01,727] [INFO] [logging.py:60:log_dist] [Rank 0] step=9980, skipped=4, lr=[0.0005947280824663794, 0.0005947280824663794], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9980 loss: 2.5215 iter time (s): 62.728 samples/sec: 16.324 %comms: 0.002878485824126643 %optimizer_step 0.05761042341765546 %forward: 23.16535885400672 %backward: 62.16809866981294 [2025-04-01 04:59:01,727] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21928.32 | forward: 145311.57 | backward_microstep: 389974.43 | backward: 389967.80 | backward_inner_microstep: 389952.68 | backward_inner: 389947.10 | backward_allreduce_microstep: 7.34 | backward_allreduce: 2.53 | reduce_tied_grads: 0.28 | comms: 18.06 | reduce_grads: 0.20 | step: 361.38 | _step_clipping: 0.13 | _step_step: 359.65 | _step_zero_grad: 0.48 | _step_check_overflow: 0.54 samples/sec: 16.324 | iteration 9980/ 143000 | elapsed time per iteration (ms): 62728.5 | learning rate: 5.947E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.520202E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 05:09:28,533] [INFO] [logging.py:60:log_dist] [Rank 0] step=9990, skipped=4, lr=[0.0005947157738741024, 0.0005947157738741024], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 9990 loss: 2.5193 iter time (s): 62.680 samples/sec: 16.337 %comms: 0.0028924014617397673 %optimizer_step 0.055703519009897895 %forward: 23.15808482171072 %backward: 62.213231547448345 [2025-04-01 05:09:28,534] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21612.28 | forward: 145154.99 | backward_microstep: 389959.08 | backward: 389952.85 | backward_inner_microstep: 389937.78 | backward_inner: 389929.69 | backward_allreduce_microstep: 7.30 | backward_allreduce: 2.51 | reduce_tied_grads: 0.48 | comms: 18.13 | reduce_grads: 0.18 | step: 349.15 | _step_clipping: 0.15 | _step_step: 347.44 | _step_zero_grad: 0.50 | _step_check_overflow: 0.51 samples/sec: 16.337 | iteration 9990/ 143000 | elapsed time per iteration (ms): 62680.7 | learning rate: 5.947E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.531694E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 05:19:55,165] [INFO] [logging.py:60:log_dist] [Rank 0] step=10000, skipped=4, lr=[0.0005947034510575206, 0.0005947034510575206], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10000 loss: 2.5174 iter time (s): 62.663 samples/sec: 16.341 %comms: 0.002858813919565697 %optimizer_step 0.05550460067767036 %forward: 23.180562713793382 %backward: 62.244017282678996 [2025-04-01 05:19:55,165] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21283.53 | forward: 145255.32 | backward_microstep: 390045.22 | backward: 390036.88 | backward_inner_microstep: 390021.64 | backward_inner: 390015.96 | backward_allreduce_microstep: 7.34 | backward_allreduce: 2.53 | reduce_tied_grads: 0.27 | comms: 17.91 | reduce_grads: 0.18 | step: 347.81 | _step_clipping: 0.11 | _step_step: 346.15 | _step_zero_grad: 0.48 | _step_check_overflow: 0.51 samples/sec: 16.341 | iteration 10000/ 143000 | elapsed time per iteration (ms): 62663.1 | learning rate: 5.947E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.530139E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 05:19:57,995] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: ../checkpoints/mydata_left-pythia160m/global_step10000/mp_rank_00_model_states.pt [2025-04-01 05:20:11,966] [INFO] [engine.py:1805:_copy_recovery_script] creating recovery script ../checkpoints/mydata_left-pythia160m/zero_to_fp32.py [2025-04-01 05:20:11,972] [INFO] [engine.py:1818:_save_zero_checkpoint] zero checkpoint saved ../checkpoints/mydata_left-pythia160m/global_step10000/zero_pp_rank_0_mp_rank_00_optim_states.pt [2025-04-01 05:30:39,968] [INFO] [logging.py:60:log_dist] [Rank 0] step=10010, skipped=4, lr=[0.0005946911140172288, 0.0005946911140172288], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10010 loss: 2.5198 iter time (s): 62.798 samples/sec: 16.306 %comms: 0.0028960751073185257 %optimizer_step 0.05701480105811811 %forward: 23.116257568896337 %backward: 62.10205025154444 [2025-04-01 05:30:39,969] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22747.00 | forward: 145165.76 | backward_microstep: 389995.68 | backward: 389989.21 | backward_inner_microstep: 389974.14 | backward_inner: 389968.45 | backward_allreduce_microstep: 7.26 | backward_allreduce: 2.48 | reduce_tied_grads: 0.31 | comms: 18.19 | reduce_grads: 0.19 | step: 358.04 | _step_clipping: 0.15 | _step_step: 356.21 | _step_zero_grad: 0.49 | _step_check_overflow: 0.61 samples/sec: 15.881 | iteration 10010/ 143000 | elapsed time per iteration (ms): 64480.3 | learning rate: 5.947E-04 | approx flops per GPU: 68.5TFLOPS | lm_loss: 2.515962E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 05:41:07,487] [INFO] [logging.py:60:log_dist] [Rank 0] step=10020, skipped=4, lr=[0.0005946787627538223, 0.0005946787627538223], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10020 loss: 2.5135 iter time (s): 62.751 samples/sec: 16.318 %comms: 0.00286723288552965 %optimizer_step 0.05674803466677836 %forward: 23.167552507698712 %backward: 62.16475366286731 [2025-04-01 05:41:07,487] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21942.48 | forward: 145379.40 | backward_microstep: 390098.99 | backward: 390091.89 | backward_inner_microstep: 390076.74 | backward_inner: 390071.05 | backward_allreduce_microstep: 7.31 | backward_allreduce: 2.54 | reduce_tied_grads: 0.30 | comms: 17.99 | reduce_grads: 0.19 | step: 356.10 | _step_clipping: 0.13 | _step_step: 354.37 | _step_zero_grad: 0.49 | _step_check_overflow: 0.54 samples/sec: 16.318 | iteration 10020/ 143000 | elapsed time per iteration (ms): 62751.9 | learning rate: 5.947E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.513884E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 05:51:34,189] [INFO] [stage1.py:697:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 1048576.0, reducing to 1048576.0 [2025-04-01 05:51:34,190] [INFO] [logging.py:60:log_dist] [Rank 0] step=10030, skipped=5, lr=[0.0005946676344564862, 0.0005946676344564862], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10030 loss: 2.5352 iter time (s): 62.670 samples/sec: 16.340 %comms: 0.0025894790094804147 %optimizer_step 0.05249749640928349 %forward: 23.165192483829646 %backward: 62.230585839457284 [2025-04-01 05:51:34,190] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21454.17 | forward: 145175.67 | backward_microstep: 390004.02 | backward: 389997.50 | backward_inner_microstep: 389980.65 | backward_inner: 389975.09 | backward_allreduce_microstep: 9.13 | backward_allreduce: 2.65 | reduce_tied_grads: 0.26 | comms: 16.23 | reduce_grads: 0.18 | step: 329.00 | _step_clipping: 0.12 | _step_step: 327.53 | _step_zero_grad: 0.47 | _step_check_overflow: 0.34 samples/sec: 16.339 | iteration 10030/ 143000 | elapsed time per iteration (ms): 62670.3 | learning rate: 5.947E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.518739E+00 | loss scale: 1048576.0 | number of skipped iterations: 1 | number of nan iterations: 0 | time (ms) [2025-04-01 05:52:35,935] [INFO] [stage1.py:697:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 1048576.0, reducing to 524288.0 [2025-04-01 06:01:59,615] [INFO] [logging.py:60:log_dist] [Rank 0] step=10040, skipped=6, lr=[0.0005946564946393452, 0.0005946564946393452], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10040 loss: 2.5101 iter time (s): 62.542 samples/sec: 16.373 %comms: 0.002584400393018552 %optimizer_step 0.051033806970254045 %forward: 23.214283247288257 %backward: 62.355702900898976 [2025-04-01 06:01:59,615] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20169.35 | forward: 145186.70 | backward_microstep: 389990.97 | backward: 389984.85 | backward_inner_microstep: 389969.63 | backward_inner: 389963.95 | backward_allreduce_microstep: 7.41 | backward_allreduce: 2.56 | reduce_tied_grads: 0.27 | comms: 16.16 | reduce_grads: 0.19 | step: 319.18 | _step_clipping: 0.12 | _step_step: 317.53 | _step_zero_grad: 0.47 | _step_check_overflow: 0.52 samples/sec: 16.373 | iteration 10040/ 143000 | elapsed time per iteration (ms): 62542.5 | learning rate: 5.947E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.525084E+00 | loss scale: 524288.0 | number of skipped iterations: 1 | number of nan iterations: 0 | time (ms) [2025-04-01 06:12:27,715] [INFO] [logging.py:60:log_dist] [Rank 0] step=10050, skipped=6, lr=[0.0005946441035543915, 0.0005946441035543915], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10050 loss: 2.5020 iter time (s): 62.809 samples/sec: 16.303 %comms: 0.0028551649884345056 %optimizer_step 0.05646509445671876 %forward: 23.154338412875067 %backward: 62.09813649108844 [2025-04-01 06:12:27,715] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22514.84 | forward: 145431.09 | backward_microstep: 390041.90 | backward: 390034.89 | backward_inner_microstep: 390019.41 | backward_inner: 390013.48 | backward_allreduce_microstep: 7.54 | backward_allreduce: 2.52 | reduce_tied_grads: 0.29 | comms: 17.93 | reduce_grads: 0.18 | step: 354.65 | _step_clipping: 0.16 | _step_step: 352.87 | _step_zero_grad: 0.49 | _step_check_overflow: 0.55 samples/sec: 16.303 | iteration 10050/ 143000 | elapsed time per iteration (ms): 62810.0 | learning rate: 5.946E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.514224E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 06:22:50,675] [INFO] [logging.py:60:log_dist] [Rank 0] step=10060, skipped=6, lr=[0.0005946316982485921, 0.0005946316982485921], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10060 loss: 2.5156 iter time (s): 62.295 samples/sec: 16.438 %comms: 0.0028722154346032753 %optimizer_step 0.055454574149317766 %forward: 23.29869514950838 %backward: 62.59897675456501 [2025-04-01 06:22:50,676] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17740.79 | forward: 145140.30 | backward_microstep: 389969.58 | backward: 389963.23 | backward_inner_microstep: 389948.10 | backward_inner: 389942.42 | backward_allreduce_microstep: 7.28 | backward_allreduce: 2.51 | reduce_tied_grads: 0.27 | comms: 17.89 | reduce_grads: 0.19 | step: 345.46 | _step_clipping: 0.13 | _step_step: 343.81 | _step_zero_grad: 0.47 | _step_check_overflow: 0.51 samples/sec: 16.438 | iteration 10060/ 143000 | elapsed time per iteration (ms): 62296.0 | learning rate: 5.946E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.521451E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 06:33:17,336] [INFO] [logging.py:60:log_dist] [Rank 0] step=10070, skipped=6, lr=[0.0005946192787225458, 0.0005946192787225458], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10070 loss: 2.5233 iter time (s): 62.666 samples/sec: 16.341 %comms: 0.002858906923282986 %optimizer_step 0.05596689908461775 %forward: 23.162911564349308 %backward: 62.23620894772039 [2025-04-01 06:33:17,336] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21392.53 | forward: 145151.58 | backward_microstep: 390012.71 | backward: 390006.40 | backward_inner_microstep: 389991.49 | backward_inner: 389985.91 | backward_allreduce_microstep: 7.28 | backward_allreduce: 2.44 | reduce_tied_grads: 0.27 | comms: 17.92 | reduce_grads: 0.18 | step: 350.72 | _step_clipping: 0.12 | _step_step: 349.05 | _step_zero_grad: 0.45 | _step_check_overflow: 0.55 samples/sec: 16.341 | iteration 10070/ 143000 | elapsed time per iteration (ms): 62666.1 | learning rate: 5.946E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.522086E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 06:43:46,054] [INFO] [logging.py:60:log_dist] [Rank 0] step=10080, skipped=6, lr=[0.000594606844976852, 0.000594606844976852], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10080 loss: 2.5102 iter time (s): 62.871 samples/sec: 16.287 %comms: 0.0028545197357574942 %optimizer_step 0.05607378917969027 %forward: 23.116901043515547 %backward: 62.04352748590334 [2025-04-01 06:43:46,054] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23195.35 | forward: 145338.83 | backward_microstep: 390082.17 | backward: 390075.36 | backward_inner_microstep: 390060.37 | backward_inner: 390054.54 | backward_allreduce_microstep: 7.25 | backward_allreduce: 2.48 | reduce_tied_grads: 0.30 | comms: 17.95 | reduce_grads: 0.20 | step: 352.54 | _step_clipping: 0.14 | _step_step: 350.74 | _step_zero_grad: 0.49 | _step_check_overflow: 0.62 samples/sec: 16.287 | iteration 10080/ 143000 | elapsed time per iteration (ms): 62871.8 | learning rate: 5.946E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.518501E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 06:54:14,334] [INFO] [logging.py:60:log_dist] [Rank 0] step=10090, skipped=6, lr=[0.0005945943970121107, 0.0005945943970121107], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10090 loss: 2.5064 iter time (s): 62.828 samples/sec: 16.299 %comms: 0.002849713748028085 %optimizer_step 0.05545794549669704 %forward: 23.12282048507713 %backward: 62.084648253883714 [2025-04-01 06:54:14,335] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22807.17 | forward: 145274.93 | backward_microstep: 390069.05 | backward: 390062.41 | backward_inner_microstep: 390047.16 | backward_inner: 390041.35 | backward_allreduce_microstep: 7.39 | backward_allreduce: 2.54 | reduce_tied_grads: 0.28 | comms: 17.90 | reduce_grads: 0.18 | step: 348.43 | _step_clipping: 0.11 | _step_step: 346.75 | _step_zero_grad: 0.46 | _step_check_overflow: 0.57 samples/sec: 16.298 | iteration 10090/ 143000 | elapsed time per iteration (ms): 62828.1 | learning rate: 5.946E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.506462E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 07:04:41,037] [INFO] [logging.py:60:log_dist] [Rank 0] step=10100, skipped=6, lr=[0.000594581934828923, 0.000594581934828923], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10100 loss: 2.5013 iter time (s): 62.670 samples/sec: 16.340 %comms: 0.002847833959297182 %optimizer_step 0.05561225442556582 %forward: 23.165893372672848 %backward: 62.23281692136594 [2025-04-01 07:04:41,038] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21387.34 | forward: 145180.04 | backward_microstep: 390019.47 | backward: 390011.41 | backward_inner_microstep: 389996.53 | backward_inner: 389990.95 | backward_allreduce_microstep: 7.21 | backward_allreduce: 2.48 | reduce_tied_grads: 0.25 | comms: 17.85 | reduce_grads: 0.18 | step: 348.52 | _step_clipping: 0.12 | _step_step: 346.84 | _step_zero_grad: 0.47 | _step_check_overflow: 0.54 samples/sec: 16.339 | iteration 10100/ 143000 | elapsed time per iteration (ms): 62670.3 | learning rate: 5.946E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.511633E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 07:15:08,686] [INFO] [logging.py:60:log_dist] [Rank 0] step=10110, skipped=6, lr=[0.00059456945842789, 0.00059456945842789], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10110 loss: 2.5252 iter time (s): 62.764 samples/sec: 16.315 %comms: 0.0028761338068750866 %optimizer_step 0.05673404226093125 %forward: 23.13932966813362 %backward: 62.15128307776383 [2025-04-01 07:15:08,687] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22161.03 | forward: 145232.47 | backward_microstep: 390095.68 | backward: 390088.40 | backward_inner_microstep: 390072.83 | backward_inner: 390066.92 | backward_allreduce_microstep: 7.47 | backward_allreduce: 2.57 | reduce_tied_grads: 0.32 | comms: 18.05 | reduce_grads: 0.22 | step: 356.09 | _step_clipping: 0.14 | _step_step: 354.31 | _step_zero_grad: 0.49 | _step_check_overflow: 0.57 samples/sec: 16.315 | iteration 10110/ 143000 | elapsed time per iteration (ms): 62764.9 | learning rate: 5.946E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.516714E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 07:25:36,486] [INFO] [logging.py:60:log_dist] [Rank 0] step=10120, skipped=6, lr=[0.0005945569678096141, 0.0005945569678096141], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10120 loss: 2.5015 iter time (s): 62.779 samples/sec: 16.311 %comms: 0.00286291181854789 %optimizer_step 0.05839481824938854 %forward: 23.154131386223455 %backward: 62.1396269919806 [2025-04-01 07:25:36,486] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22168.87 | forward: 145360.21 | backward_microstep: 390116.18 | backward: 390108.76 | backward_inner_microstep: 390093.41 | backward_inner: 390085.88 | backward_allreduce_microstep: 7.36 | backward_allreduce: 2.53 | reduce_tied_grads: 0.29 | comms: 17.97 | reduce_grads: 0.19 | step: 366.60 | _step_clipping: 0.15 | _step_step: 364.78 | _step_zero_grad: 0.50 | _step_check_overflow: 0.59 samples/sec: 16.311 | iteration 10120/ 143000 | elapsed time per iteration (ms): 62780.0 | learning rate: 5.946E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.521368E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 07:36:02,202] [INFO] [logging.py:60:log_dist] [Rank 0] step=10130, skipped=6, lr=[0.0005945444629746982, 0.0005945444629746982], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10130 loss: 2.5271 iter time (s): 62.571 samples/sec: 16.365 %comms: 0.0028655811898972423 %optimizer_step 0.05709290913761628 %forward: 23.246092245454076 %backward: 62.37232339756438 [2025-04-01 07:36:02,204] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19815.85 | forward: 145453.46 | backward_microstep: 390280.57 | backward: 390270.76 | backward_inner_microstep: 390254.15 | backward_inner: 390247.73 | backward_allreduce_microstep: 7.99 | backward_allreduce: 2.68 | reduce_tied_grads: 0.28 | comms: 17.93 | reduce_grads: 0.20 | step: 357.24 | _step_clipping: 0.11 | _step_step: 355.56 | _step_zero_grad: 0.49 | _step_check_overflow: 0.51 samples/sec: 16.365 | iteration 10130/ 143000 | elapsed time per iteration (ms): 62571.7 | learning rate: 5.945E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.518531E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 07:46:27,374] [INFO] [logging.py:60:log_dist] [Rank 0] step=10140, skipped=6, lr=[0.0005945319439237456, 0.0005945319439237456], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10140 loss: 2.5148 iter time (s): 62.516 samples/sec: 16.380 %comms: 0.002957937117835643 %optimizer_step 0.058276902514837255 %forward: 23.281414803843308 %backward: 62.51082450969204 [2025-04-01 07:46:27,374] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18586.37 | forward: 145547.22 | backward_microstep: 390811.46 | backward: 390795.71 | backward_inner_microstep: 390778.51 | backward_inner: 390769.43 | backward_allreduce_microstep: 7.91 | backward_allreduce: 2.73 | reduce_tied_grads: 0.35 | comms: 18.49 | reduce_grads: 0.23 | step: 364.33 | _step_clipping: 0.12 | _step_step: 362.36 | _step_zero_grad: 0.60 | _step_check_overflow: 0.60 samples/sec: 16.380 | iteration 10140/ 143000 | elapsed time per iteration (ms): 62517.1 | learning rate: 5.945E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.515611E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 07:56:55,649] [INFO] [logging.py:60:log_dist] [Rank 0] step=10150, skipped=6, lr=[0.0005945194106573607, 0.0005945194106573607], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10150 loss: 2.5201 iter time (s): 62.827 samples/sec: 16.299 %comms: 0.0028623009415673396 %optimizer_step 0.056660246321416086 %forward: 23.158053702164214 %backward: 62.138703529641894 [2025-04-01 07:56:55,650] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22197.04 | forward: 145494.96 | backward_microstep: 390406.74 | backward: 390398.43 | backward_inner_microstep: 390382.63 | backward_inner: 390376.41 | backward_allreduce_microstep: 7.56 | backward_allreduce: 2.58 | reduce_tied_grads: 0.30 | comms: 17.98 | reduce_grads: 0.18 | step: 355.98 | _step_clipping: 0.16 | _step_step: 354.27 | _step_zero_grad: 0.49 | _step_check_overflow: 0.50 samples/sec: 16.299 | iteration 10150/ 143000 | elapsed time per iteration (ms): 62827.5 | learning rate: 5.945E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.513705E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 08:07:21,406] [INFO] [logging.py:60:log_dist] [Rank 0] step=10160, skipped=6, lr=[0.0005945068631761484, 0.0005945068631761484], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10160 loss: 2.5389 iter time (s): 62.575 samples/sec: 16.364 %comms: 0.0028871950017790903 %optimizer_step 0.057860184912858936 %forward: 23.252424533214022 %backward: 62.40765572904675 [2025-04-01 08:07:21,406] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19520.28 | forward: 145502.22 | backward_microstep: 390525.67 | backward: 390516.38 | backward_inner_microstep: 390500.49 | backward_inner: 390494.29 | backward_allreduce_microstep: 7.54 | backward_allreduce: 2.60 | reduce_tied_grads: 0.30 | comms: 18.07 | reduce_grads: 0.20 | step: 362.06 | _step_clipping: 0.12 | _step_step: 360.19 | _step_zero_grad: 0.53 | _step_check_overflow: 0.65 samples/sec: 16.364 | iteration 10160/ 143000 | elapsed time per iteration (ms): 62575.7 | learning rate: 5.945E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.519111E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 08:17:48,781] [INFO] [logging.py:60:log_dist] [Rank 0] step=10170, skipped=6, lr=[0.0005944943014807142, 0.0005944943014807142], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10170 loss: 2.5190 iter time (s): 62.737 samples/sec: 16.322 %comms: 0.002891413495921259 %optimizer_step 0.05670965422718337 %forward: 23.168797307230456 %backward: 62.205359116117585 [2025-04-01 08:17:48,781] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21569.62 | forward: 145353.91 | backward_microstep: 390264.86 | backward: 390257.30 | backward_inner_microstep: 390241.72 | backward_inner: 390235.68 | backward_allreduce_microstep: 7.49 | backward_allreduce: 2.57 | reduce_tied_grads: 0.32 | comms: 18.14 | reduce_grads: 0.18 | step: 355.78 | _step_clipping: 0.13 | _step_step: 353.92 | _step_zero_grad: 0.52 | _step_check_overflow: 0.63 samples/sec: 16.322 | iteration 10170/ 143000 | elapsed time per iteration (ms): 62737.5 | learning rate: 5.945E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.516788E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 08:28:19,350] [INFO] [logging.py:60:log_dist] [Rank 0] step=10180, skipped=6, lr=[0.0005944817255716646, 0.0005944817255716646], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10180 loss: 2.5054 iter time (s): 63.056 samples/sec: 16.239 %comms: 0.0028489394212177204 %optimizer_step 0.056725647671779 %forward: 23.097864342366766 %backward: 61.94585274988601 [2025-04-01 08:28:19,350] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24048.41 | forward: 145646.63 | backward_microstep: 390619.64 | backward: 390607.74 | backward_inner_microstep: 390586.84 | backward_inner: 390579.93 | backward_allreduce_microstep: 8.07 | backward_allreduce: 2.77 | reduce_tied_grads: 0.29 | comms: 17.96 | reduce_grads: 0.21 | step: 357.69 | _step_clipping: 0.16 | _step_step: 355.92 | _step_zero_grad: 0.50 | _step_check_overflow: 0.54 samples/sec: 16.239 | iteration 10180/ 143000 | elapsed time per iteration (ms): 63056.9 | learning rate: 5.945E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.512881E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 08:38:49,887] [INFO] [logging.py:60:log_dist] [Rank 0] step=10190, skipped=6, lr=[0.0005944691354496063, 0.0005944691354496063], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10190 loss: 2.5214 iter time (s): 63.053 samples/sec: 16.240 %comms: 0.002855434855382608 %optimizer_step 0.05652763524761369 %forward: 23.071144415546996 %backward: 61.91932945961987 [2025-04-01 08:38:49,887] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24429.84 | forward: 145470.84 | backward_microstep: 390429.94 | backward: 390420.90 | backward_inner_microstep: 390403.88 | backward_inner: 390397.64 | backward_allreduce_microstep: 8.30 | backward_allreduce: 2.85 | reduce_tied_grads: 0.30 | comms: 18.00 | reduce_grads: 0.20 | step: 356.42 | _step_clipping: 0.12 | _step_step: 354.72 | _step_zero_grad: 0.49 | _step_check_overflow: 0.51 samples/sec: 16.240 | iteration 10190/ 143000 | elapsed time per iteration (ms): 63053.7 | learning rate: 5.945E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.516591E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 08:49:16,339] [INFO] [logging.py:60:log_dist] [Rank 0] step=10200, skipped=6, lr=[0.0005944565311151472, 0.0005944565311151472], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10200 loss: 2.4950 iter time (s): 62.645 samples/sec: 16.346 %comms: 0.002875726571909185 %optimizer_step 0.05662342156583482 %forward: 23.336810187995308 %backward: 62.40086882687342 [2025-04-01 08:49:16,340] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19034.70 | forward: 146192.79 | backward_microstep: 390925.17 | backward: 390908.48 | backward_inner_microstep: 390891.54 | backward_inner: 390880.46 | backward_allreduce_microstep: 7.77 | backward_allreduce: 2.68 | reduce_tied_grads: 0.33 | comms: 18.01 | reduce_grads: 0.19 | step: 354.72 | _step_clipping: 0.14 | _step_step: 352.96 | _step_zero_grad: 0.48 | _step_check_overflow: 0.57 samples/sec: 16.346 | iteration 10200/ 143000 | elapsed time per iteration (ms): 62645.3 | learning rate: 5.945E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.508057E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 08:59:43,634] [INFO] [logging.py:60:log_dist] [Rank 0] step=10210, skipped=6, lr=[0.0005944439125688954, 0.0005944439125688954], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10210 loss: 2.5310 iter time (s): 62.729 samples/sec: 16.324 %comms: 0.00288227951763115 %optimizer_step 0.056984901679270915 %forward: 23.21306860576437 %backward: 62.275829032474185 [2025-04-01 08:59:43,636] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20753.42 | forward: 145613.12 | backward_microstep: 390661.91 | backward: 390649.69 | backward_inner_microstep: 390633.18 | backward_inner: 390626.63 | backward_allreduce_microstep: 7.77 | backward_allreduce: 2.67 | reduce_tied_grads: 0.31 | comms: 18.08 | reduce_grads: 0.19 | step: 357.46 | _step_clipping: 0.12 | _step_step: 355.63 | _step_zero_grad: 0.53 | _step_check_overflow: 0.58 samples/sec: 16.324 | iteration 10210/ 143000 | elapsed time per iteration (ms): 62729.6 | learning rate: 5.944E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.513596E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 09:10:15,809] [INFO] [logging.py:60:log_dist] [Rank 0] step=10220, skipped=6, lr=[0.0005944312798114601, 0.0005944312798114601], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10220 loss: 2.5106 iter time (s): 63.217 samples/sec: 16.198 %comms: 0.0028381605372488927 %optimizer_step 0.056486764087816305 %forward: 23.023361219426615 %backward: 61.766912588232294 [2025-04-01 09:10:15,810] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25893.45 | forward: 145546.40 | backward_microstep: 390480.80 | backward: 390470.85 | backward_inner_microstep: 390453.77 | backward_inner: 390447.35 | backward_allreduce_microstep: 8.34 | backward_allreduce: 3.01 | reduce_tied_grads: 0.31 | comms: 17.94 | reduce_grads: 0.19 | step: 357.09 | _step_clipping: 0.13 | _step_step: 355.43 | _step_zero_grad: 0.49 | _step_check_overflow: 0.50 samples/sec: 16.198 | iteration 10220/ 143000 | elapsed time per iteration (ms): 63217.4 | learning rate: 5.944E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.507732E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 09:20:43,599] [INFO] [logging.py:60:log_dist] [Rank 0] step=10230, skipped=6, lr=[0.000594418632843451, 0.000594418632843451], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10230 loss: 2.5201 iter time (s): 62.778 samples/sec: 16.311 %comms: 0.0028767826754665564 %optimizer_step 0.058667886567241916 %forward: 23.24018502261208 %backward: 62.23491232267552 [2025-04-01 09:20:43,599] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20885.25 | forward: 145898.06 | backward_microstep: 390715.74 | backward: 390700.55 | backward_inner_microstep: 390683.57 | backward_inner: 390677.04 | backward_allreduce_microstep: 7.98 | backward_allreduce: 2.76 | reduce_tied_grads: 0.30 | comms: 18.06 | reduce_grads: 0.19 | step: 368.31 | _step_clipping: 0.14 | _step_step: 366.47 | _step_zero_grad: 0.53 | _step_check_overflow: 0.56 samples/sec: 16.311 | iteration 10230/ 143000 | elapsed time per iteration (ms): 62778.9 | learning rate: 5.944E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.526957E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 09:31:08,796] [INFO] [logging.py:60:log_dist] [Rank 0] step=10240, skipped=6, lr=[0.0005944059716654784, 0.0005944059716654784], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10240 loss: 2.5035 iter time (s): 62.519 samples/sec: 16.379 %comms: 0.002900376773262635 %optimizer_step 0.05856244155200234 %forward: 23.322447853776865 %backward: 62.492293513385434 [2025-04-01 09:31:08,797] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18371.47 | forward: 145810.09 | backward_microstep: 390712.49 | backward: 390696.85 | backward_inner_microstep: 390678.56 | backward_inner: 390672.24 | backward_allreduce_microstep: 7.80 | backward_allreduce: 2.68 | reduce_tied_grads: 0.28 | comms: 18.13 | reduce_grads: 0.19 | step: 366.13 | _step_clipping: 0.11 | _step_step: 364.37 | _step_zero_grad: 0.51 | _step_check_overflow: 0.51 samples/sec: 16.379 | iteration 10240/ 143000 | elapsed time per iteration (ms): 62519.8 | learning rate: 5.944E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.512428E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 09:41:38,298] [INFO] [logging.py:60:log_dist] [Rank 0] step=10250, skipped=6, lr=[0.0005943932962781535, 0.0005943932962781535], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10250 loss: 2.5192 iter time (s): 62.950 samples/sec: 16.267 %comms: 0.0028995606078869333 %optimizer_step 0.05623936351741574 %forward: 23.116031439642107 %backward: 62.041610329009266 [2025-04-01 09:41:38,298] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23178.42 | forward: 145514.44 | backward_microstep: 390560.05 | backward: 390549.31 | backward_inner_microstep: 390532.75 | backward_inner: 390526.32 | backward_allreduce_microstep: 7.96 | backward_allreduce: 2.67 | reduce_tied_grads: 0.31 | comms: 18.25 | reduce_grads: 0.22 | step: 354.02 | _step_clipping: 0.13 | _step_step: 352.16 | _step_zero_grad: 0.54 | _step_check_overflow: 0.58 samples/sec: 16.267 | iteration 10250/ 143000 | elapsed time per iteration (ms): 62950.2 | learning rate: 5.944E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.514605E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 09:52:11,210] [INFO] [logging.py:60:log_dist] [Rank 0] step=10260, skipped=6, lr=[0.0005943806066820881, 0.0005943806066820881], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10260 loss: 2.5083 iter time (s): 63.291 samples/sec: 16.179 %comms: 0.002835265527393252 %optimizer_step 0.05550916112527396 %forward: 22.972636302595085 %backward: 61.67492712259141 [2025-04-01 09:52:11,210] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26950.81 | forward: 145395.26 | backward_microstep: 390351.75 | backward: 390344.50 | backward_inner_microstep: 390328.51 | backward_inner: 390322.40 | backward_allreduce_microstep: 7.64 | backward_allreduce: 2.62 | reduce_tied_grads: 0.29 | comms: 17.94 | reduce_grads: 0.19 | step: 351.32 | _step_clipping: 0.12 | _step_step: 349.61 | _step_zero_grad: 0.46 | _step_check_overflow: 0.58 samples/sec: 16.179 | iteration 10260/ 143000 | elapsed time per iteration (ms): 63291.2 | learning rate: 5.944E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.518184E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 10:02:37,875] [INFO] [logging.py:60:log_dist] [Rank 0] step=10270, skipped=6, lr=[0.0005943679028778944, 0.0005943679028778944], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10270 loss: 2.5033 iter time (s): 62.666 samples/sec: 16.341 %comms: 0.0028586177417567913 %optimizer_step 0.05623581663332731 %forward: 23.239757835737798 %backward: 62.30237675715683 [2025-04-01 10:02:37,876] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20377.95 | forward: 145634.30 | backward_microstep: 390432.26 | backward: 390424.17 | backward_inner_microstep: 390408.13 | backward_inner: 390401.99 | backward_allreduce_microstep: 7.71 | backward_allreduce: 2.67 | reduce_tied_grads: 0.29 | comms: 17.91 | reduce_grads: 0.19 | step: 352.41 | _step_clipping: 0.11 | _step_step: 350.56 | _step_zero_grad: 0.50 | _step_check_overflow: 0.68 samples/sec: 16.340 | iteration 10270/ 143000 | elapsed time per iteration (ms): 62666.6 | learning rate: 5.944E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.509268E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 10:13:06,697] [INFO] [logging.py:60:log_dist] [Rank 0] step=10280, skipped=6, lr=[0.0005943551848661858, 0.0005943551848661858], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10280 loss: 2.5083 iter time (s): 62.882 samples/sec: 16.285 %comms: 0.002860309044784452 %optimizer_step 0.05579310731584212 %forward: 23.125870746781846 %backward: 62.0777043834427 [2025-04-01 10:13:06,697] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22831.21 | forward: 145419.00 | backward_microstep: 390361.50 | backward: 390354.07 | backward_inner_microstep: 390338.17 | backward_inner: 390332.30 | backward_allreduce_microstep: 7.71 | backward_allreduce: 2.62 | reduce_tied_grads: 0.26 | comms: 17.99 | reduce_grads: 0.19 | step: 350.84 | _step_clipping: 0.12 | _step_step: 349.25 | _step_zero_grad: 0.47 | _step_check_overflow: 0.46 samples/sec: 16.284 | iteration 10280/ 143000 | elapsed time per iteration (ms): 62882.1 | learning rate: 5.944E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.507534E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 10:23:28,563] [INFO] [logging.py:60:log_dist] [Rank 0] step=10290, skipped=6, lr=[0.0005943424526475761, 0.0005943424526475761], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10290 loss: 2.5063 iter time (s): 62.186 samples/sec: 16.467 %comms: 0.002948390577246444 %optimizer_step 0.05630562595141687 %forward: 23.412979256226237 %backward: 62.80871171808593 [2025-04-01 10:23:28,563] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15405.16 | forward: 145595.98 | backward_microstep: 390591.78 | backward: 390582.34 | backward_inner_microstep: 390566.37 | backward_inner: 390560.19 | backward_allreduce_microstep: 7.57 | backward_allreduce: 2.61 | reduce_tied_grads: 0.29 | comms: 18.33 | reduce_grads: 0.19 | step: 350.14 | _step_clipping: 0.12 | _step_step: 348.18 | _step_zero_grad: 0.50 | _step_check_overflow: 0.57 samples/sec: 16.467 | iteration 10290/ 143000 | elapsed time per iteration (ms): 62186.6 | learning rate: 5.943E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.508427E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 10:33:58,477] [INFO] [logging.py:60:log_dist] [Rank 0] step=10300, skipped=6, lr=[0.0005943297062226797, 0.0005943297062226797], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10300 loss: 2.5316 iter time (s): 62.991 samples/sec: 16.256 %comms: 0.0028464094805920707 %optimizer_step 0.0550927829321495 %forward: 23.13820740949142 %backward: 61.97898920587952 [2025-04-01 10:33:58,478] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23502.88 | forward: 145749.67 | backward_microstep: 390420.10 | backward: 390411.29 | backward_inner_microstep: 390395.35 | backward_inner: 390389.25 | backward_allreduce_microstep: 7.62 | backward_allreduce: 2.61 | reduce_tied_grads: 0.29 | comms: 17.93 | reduce_grads: 0.19 | step: 347.03 | _step_clipping: 0.12 | _step_step: 345.41 | _step_zero_grad: 0.48 | _step_check_overflow: 0.47 samples/sec: 16.256 | iteration 10300/ 143000 | elapsed time per iteration (ms): 62991.5 | learning rate: 5.943E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.508867E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 10:44:25,963] [INFO] [logging.py:60:log_dist] [Rank 0] step=10310, skipped=6, lr=[0.0005943169455921119, 0.0005943169455921119], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10310 loss: 2.4906 iter time (s): 62.748 samples/sec: 16.319 %comms: 0.0028620617527167686 %optimizer_step 0.05574202004501648 %forward: 23.200237292789495 %backward: 62.215254749094015 [2025-04-01 10:44:25,964] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21254.70 | forward: 145576.97 | backward_microstep: 390397.18 | backward: 390388.59 | backward_inner_microstep: 390372.70 | backward_inner: 390366.59 | backward_allreduce_microstep: 7.62 | backward_allreduce: 2.66 | reduce_tied_grads: 0.27 | comms: 17.96 | reduce_grads: 0.19 | step: 349.77 | _step_clipping: 0.12 | _step_step: 347.82 | _step_zero_grad: 0.49 | _step_check_overflow: 0.59 samples/sec: 16.319 | iteration 10310/ 143000 | elapsed time per iteration (ms): 62748.6 | learning rate: 5.943E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.507058E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 10:54:54,838] [INFO] [logging.py:60:log_dist] [Rank 0] step=10320, skipped=6, lr=[0.0005943041707564885, 0.0005943041707564885], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10320 loss: 2.5018 iter time (s): 62.887 samples/sec: 16.283 %comms: 0.0028531983542608736 %optimizer_step 0.05596473998532856 %forward: 23.13678191586 %backward: 62.07309561769213 [2025-04-01 10:54:54,840] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22750.67 | forward: 145500.26 | backward_microstep: 390366.53 | backward: 390359.01 | backward_inner_microstep: 390343.21 | backward_inner: 390337.18 | backward_allreduce_microstep: 7.65 | backward_allreduce: 2.64 | reduce_tied_grads: 0.29 | comms: 17.94 | reduce_grads: 0.18 | step: 351.95 | _step_clipping: 0.14 | _step_step: 349.03 | _step_zero_grad: 0.46 | _step_check_overflow: 1.76 samples/sec: 16.283 | iteration 10320/ 143000 | elapsed time per iteration (ms): 62887.6 | learning rate: 5.943E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.500390E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 11:05:23,058] [INFO] [logging.py:60:log_dist] [Rank 0] step=10330, skipped=6, lr=[0.0005942913817164262, 0.0005942913817164262], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10330 loss: 2.5042 iter time (s): 62.821 samples/sec: 16.300 %comms: 0.002873830889355849 %optimizer_step 0.05856033397698738 %forward: 23.13480319043572 %backward: 62.16146640351591 [2025-04-01 11:05:23,059] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22086.63 | forward: 145335.77 | backward_microstep: 390515.14 | backward: 390506.21 | backward_inner_microstep: 390489.27 | backward_inner: 390482.85 | backward_allreduce_microstep: 8.21 | backward_allreduce: 2.79 | reduce_tied_grads: 0.35 | comms: 18.05 | reduce_grads: 0.23 | step: 367.88 | _step_clipping: 0.14 | _step_step: 366.00 | _step_zero_grad: 0.50 | _step_check_overflow: 0.62 samples/sec: 16.300 | iteration 10330/ 143000 | elapsed time per iteration (ms): 62821.9 | learning rate: 5.943E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.506446E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 11:15:50,965] [INFO] [logging.py:60:log_dist] [Rank 0] step=10340, skipped=6, lr=[0.0005942785784725421, 0.0005942785784725421], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10340 loss: 2.5035 iter time (s): 62.790 samples/sec: 16.308 %comms: 0.0028758668035464697 %optimizer_step 0.05797142829691088 %forward: 23.19805364080781 %backward: 62.2007685632621 [2025-04-01 11:15:50,966] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21378.05 | forward: 145660.72 | backward_microstep: 390567.16 | backward: 390559.00 | backward_inner_microstep: 390542.71 | backward_inner: 390536.60 | backward_allreduce_microstep: 7.86 | backward_allreduce: 2.71 | reduce_tied_grads: 0.31 | comms: 18.06 | reduce_grads: 0.20 | step: 364.00 | _step_clipping: 0.13 | _step_step: 362.16 | _step_zero_grad: 0.53 | _step_check_overflow: 0.60 samples/sec: 16.308 | iteration 10340/ 143000 | elapsed time per iteration (ms): 62790.7 | learning rate: 5.943E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.505558E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 11:26:18,664] [INFO] [logging.py:60:log_dist] [Rank 0] step=10350, skipped=6, lr=[0.0005942657610254543, 0.0005942657610254543], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10350 loss: 2.4954 iter time (s): 62.769 samples/sec: 16.314 %comms: 0.0029381210024627593 %optimizer_step 0.05739407222675676 %forward: 23.22752010596449 %backward: 62.24168054902853 [2025-04-01 11:26:18,665] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20906.05 | forward: 145797.62 | backward_microstep: 390697.62 | backward: 390686.94 | backward_inner_microstep: 390670.24 | backward_inner: 390663.64 | backward_allreduce_microstep: 7.95 | backward_allreduce: 2.74 | reduce_tied_grads: 0.29 | comms: 18.44 | reduce_grads: 0.19 | step: 360.26 | _step_clipping: 0.14 | _step_step: 358.38 | _step_zero_grad: 0.52 | _step_check_overflow: 0.62 samples/sec: 16.314 | iteration 10350/ 143000 | elapsed time per iteration (ms): 62769.9 | learning rate: 5.943E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.501250E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 11:36:45,013] [INFO] [logging.py:60:log_dist] [Rank 0] step=10360, skipped=6, lr=[0.0005942529293757813, 0.0005942529293757813], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10360 loss: 2.5309 iter time (s): 62.634 samples/sec: 16.349 %comms: 0.0028722466814262598 %optimizer_step 0.05594645951300057 %forward: 23.252038527939987 %backward: 62.35444231604479 [2025-04-01 11:36:45,013] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19865.28 | forward: 145637.49 | backward_microstep: 390561.99 | backward: 390552.60 | backward_inner_microstep: 390536.45 | backward_inner: 390530.04 | backward_allreduce_microstep: 7.74 | backward_allreduce: 2.65 | reduce_tied_grads: 0.31 | comms: 17.99 | reduce_grads: 0.20 | step: 350.42 | _step_clipping: 0.12 | _step_step: 348.69 | _step_zero_grad: 0.46 | _step_check_overflow: 0.57 samples/sec: 16.349 | iteration 10360/ 143000 | elapsed time per iteration (ms): 62634.8 | learning rate: 5.943E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.506607E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 11:47:05,062] [INFO] [logging.py:60:log_dist] [Rank 0] step=10370, skipped=6, lr=[0.0005942400835241424, 0.0005942400835241424], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10370 loss: 2.5099 iter time (s): 62.004 samples/sec: 16.515 %comms: 0.002892310471442508 %optimizer_step 0.05702533911756771 %forward: 23.431709390179893 %backward: 62.95349197215291 [2025-04-01 11:47:05,063] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14153.68 | forward: 145286.99 | backward_microstep: 390346.16 | backward: 390339.56 | backward_inner_microstep: 390324.26 | backward_inner: 390318.50 | backward_allreduce_microstep: 7.43 | backward_allreduce: 2.66 | reduce_tied_grads: 0.28 | comms: 17.93 | reduce_grads: 0.20 | step: 353.58 | _step_clipping: 0.12 | _step_step: 351.90 | _step_zero_grad: 0.49 | _step_check_overflow: 0.53 samples/sec: 16.515 | iteration 10370/ 143000 | elapsed time per iteration (ms): 62005.0 | learning rate: 5.942E-04 | approx flops per GPU: 71.2TFLOPS | lm_loss: 2.511076E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 11:57:35,987] [INFO] [logging.py:60:log_dist] [Rank 0] step=10380, skipped=6, lr=[0.0005942272234711577, 0.0005942272234711577], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10380 loss: 2.5229 iter time (s): 63.092 samples/sec: 16.230 %comms: 0.004269036561466988 %optimizer_step 0.05740427636001679 %forward: 23.119670527359446 %backward: 61.89584377389169 [2025-04-01 11:57:35,990] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24231.08 | forward: 145866.31 | backward_microstep: 390520.47 | backward: 390512.41 | backward_inner_microstep: 390496.48 | backward_inner: 390490.32 | backward_allreduce_microstep: 7.66 | backward_allreduce: 2.63 | reduce_tied_grads: 0.29 | comms: 26.93 | reduce_grads: 0.18 | step: 362.17 | _step_clipping: 0.13 | _step_step: 360.44 | _step_zero_grad: 0.49 | _step_check_overflow: 0.56 samples/sec: 16.230 | iteration 10380/ 143000 | elapsed time per iteration (ms): 63092.7 | learning rate: 5.942E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.510202E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 12:07:55,279] [INFO] [logging.py:60:log_dist] [Rank 0] step=10390, skipped=6, lr=[0.0005942143492174478, 0.0005942143492174478], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10390 loss: 2.5060 iter time (s): 61.928 samples/sec: 16.535 %comms: 0.0028983230112024964 %optimizer_step 0.057976084983328915 %forward: 23.45787113506819 %backward: 63.0307310940969 [2025-04-01 12:07:55,280] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13409.78 | forward: 145270.97 | backward_microstep: 390346.34 | backward: 390339.56 | backward_inner_microstep: 390323.88 | backward_inner: 390318.07 | backward_allreduce_microstep: 7.65 | backward_allreduce: 2.63 | reduce_tied_grads: 0.29 | comms: 17.95 | reduce_grads: 0.21 | step: 359.04 | _step_clipping: 0.11 | _step_step: 357.29 | _step_zero_grad: 0.52 | _step_check_overflow: 0.53 samples/sec: 16.535 | iteration 10390/ 143000 | elapsed time per iteration (ms): 61929.0 | learning rate: 5.942E-04 | approx flops per GPU: 71.3TFLOPS | lm_loss: 2.514423E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 12:18:16,107] [INFO] [logging.py:60:log_dist] [Rank 0] step=10400, skipped=6, lr=[0.0005942014607636342, 0.0005942014607636342], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10400 loss: 2.5132 iter time (s): 62.082 samples/sec: 16.494 %comms: 0.0028899938464556495 %optimizer_step 0.05791827554483094 %forward: 23.41501997672867 %backward: 62.89404657385016 [2025-04-01 12:18:16,107] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14704.62 | forward: 145365.56 | backward_microstep: 390467.39 | backward: 390459.99 | backward_inner_microstep: 390443.60 | backward_inner: 390437.55 | backward_allreduce_microstep: 8.00 | backward_allreduce: 2.75 | reduce_tied_grads: 0.28 | comms: 17.94 | reduce_grads: 0.19 | step: 359.57 | _step_clipping: 0.11 | _step_step: 357.84 | _step_zero_grad: 0.51 | _step_check_overflow: 0.53 samples/sec: 16.494 | iteration 10400/ 143000 | elapsed time per iteration (ms): 62082.7 | learning rate: 5.942E-04 | approx flops per GPU: 71.2TFLOPS | lm_loss: 2.507811E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 12:28:40,425] [INFO] [logging.py:60:log_dist] [Rank 0] step=10410, skipped=6, lr=[0.0005941885581103388, 0.0005941885581103388], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10410 loss: 2.5169 iter time (s): 62.431 samples/sec: 16.402 %comms: 0.002880174772452894 %optimizer_step 0.05860435226811468 %forward: 23.3211007699937 %backward: 62.55374874265023 [2025-04-01 12:28:40,425] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17867.04 | forward: 145596.54 | backward_microstep: 390538.49 | backward: 390530.87 | backward_inner_microstep: 390513.89 | backward_inner: 390507.48 | backward_allreduce_microstep: 8.29 | backward_allreduce: 2.87 | reduce_tied_grads: 0.32 | comms: 17.98 | reduce_grads: 0.22 | step: 365.87 | _step_clipping: 0.12 | _step_step: 363.99 | _step_zero_grad: 0.55 | _step_check_overflow: 0.62 samples/sec: 16.402 | iteration 10410/ 143000 | elapsed time per iteration (ms): 62431.8 | learning rate: 5.942E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.508163E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 12:39:08,076] [INFO] [logging.py:60:log_dist] [Rank 0] step=10420, skipped=6, lr=[0.0005941756412581844, 0.0005941756412581844], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10420 loss: 2.5010 iter time (s): 62.765 samples/sec: 16.315 %comms: 0.0028598279964379258 %optimizer_step 0.05665928331449269 %forward: 23.211655739343893 %backward: 62.221308069462545 [2025-04-01 12:39:08,077] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21102.65 | forward: 145686.91 | backward_microstep: 390538.14 | backward: 390529.23 | backward_inner_microstep: 390512.78 | backward_inner: 390506.48 | backward_allreduce_microstep: 7.87 | backward_allreduce: 2.70 | reduce_tied_grads: 0.29 | comms: 17.95 | reduce_grads: 0.19 | step: 355.62 | _step_clipping: 0.12 | _step_step: 353.99 | _step_zero_grad: 0.48 | _step_check_overflow: 0.45 samples/sec: 16.315 | iteration 10420/ 143000 | elapsed time per iteration (ms): 62765.1 | learning rate: 5.942E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.497761E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 12:49:32,467] [INFO] [logging.py:60:log_dist] [Rank 0] step=10430, skipped=6, lr=[0.0005941627102077946, 0.0005941627102077946], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10430 loss: 2.4963 iter time (s): 62.438 samples/sec: 16.400 %comms: 0.0029446789541327277 %optimizer_step 0.062203302460215444 %forward: 23.37898233958839 %backward: 62.568504127674515 [2025-04-01 12:49:32,468] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17370.51 | forward: 145974.79 | backward_microstep: 390678.72 | backward: 390668.18 | backward_inner_microstep: 390651.28 | backward_inner: 390644.90 | backward_allreduce_microstep: 8.05 | backward_allreduce: 2.80 | reduce_tied_grads: 0.33 | comms: 18.39 | reduce_grads: 0.23 | step: 388.39 | _step_clipping: 0.14 | _step_step: 386.32 | _step_zero_grad: 0.59 | _step_check_overflow: 0.66 samples/sec: 16.400 | iteration 10430/ 143000 | elapsed time per iteration (ms): 62439.1 | learning rate: 5.942E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.508251E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 13:00:02,418] [INFO] [logging.py:60:log_dist] [Rank 0] step=10440, skipped=6, lr=[0.0005941497649597931, 0.0005941497649597931], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10440 loss: 2.5147 iter time (s): 62.995 samples/sec: 16.255 %comms: 0.00288666812086675 %optimizer_step 0.05912202573658768 %forward: 23.140863094738407 %backward: 62.04367476750837 [2025-04-01 13:00:02,419] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22933.16 | forward: 145774.73 | backward_microstep: 390854.22 | backward: 390841.07 | backward_inner_microstep: 390823.44 | backward_inner: 390816.58 | backward_allreduce_microstep: 8.33 | backward_allreduce: 2.84 | reduce_tied_grads: 0.35 | comms: 18.18 | reduce_grads: 0.22 | step: 372.44 | _step_clipping: 0.14 | _step_step: 370.38 | _step_zero_grad: 0.57 | _step_check_overflow: 0.69 samples/sec: 16.255 | iteration 10440/ 143000 | elapsed time per iteration (ms): 62995.1 | learning rate: 5.941E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.509608E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 13:10:29,000] [INFO] [logging.py:60:log_dist] [Rank 0] step=10450, skipped=6, lr=[0.0005941368055148049, 0.0005941368055148049], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10450 loss: 2.5010 iter time (s): 62.658 samples/sec: 16.343 %comms: 0.0028893321176791786 %optimizer_step 0.05776666554031686 %forward: 23.237091367297502 %backward: 62.371323129155044 [2025-04-01 13:10:29,001] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19793.23 | forward: 145597.85 | backward_microstep: 390815.68 | backward: 390803.23 | backward_inner_microstep: 390785.36 | backward_inner: 390778.68 | backward_allreduce_microstep: 8.21 | backward_allreduce: 2.82 | reduce_tied_grads: 0.32 | comms: 18.10 | reduce_grads: 0.20 | step: 361.95 | _step_clipping: 0.12 | _step_step: 360.10 | _step_zero_grad: 0.53 | _step_check_overflow: 0.59 samples/sec: 16.343 | iteration 10450/ 143000 | elapsed time per iteration (ms): 62658.2 | learning rate: 5.941E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.502179E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 13:21:00,348] [INFO] [logging.py:60:log_dist] [Rank 0] step=10460, skipped=6, lr=[0.0005941238318734556, 0.0005941238318734556], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10460 loss: 2.5064 iter time (s): 63.134 samples/sec: 16.219 %comms: 0.002859320549449221 %optimizer_step 0.05624477173307396 %forward: 23.06173305052162 %backward: 61.8861455814931 [2025-04-01 13:21:00,349] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24673.07 | forward: 145598.48 | backward_microstep: 390727.18 | backward: 390713.44 | backward_inner_microstep: 390693.75 | backward_inner: 390686.98 | backward_allreduce_microstep: 8.27 | backward_allreduce: 2.86 | reduce_tied_grads: 0.32 | comms: 18.05 | reduce_grads: 0.20 | step: 355.10 | _step_clipping: 0.11 | _step_step: 353.39 | _step_zero_grad: 0.52 | _step_check_overflow: 0.50 samples/sec: 16.219 | iteration 10460/ 143000 | elapsed time per iteration (ms): 63134.8 | learning rate: 5.941E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.505374E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 13:31:29,729] [INFO] [logging.py:60:log_dist] [Rank 0] step=10470, skipped=6, lr=[0.0005941108440363714, 0.0005941108440363714], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10470 loss: 2.5143 iter time (s): 62.937 samples/sec: 16.270 %comms: 0.0028781487980888954 %optimizer_step 0.06027402794966937 %forward: 23.13555311155709 %backward: 62.06565426794294 [2025-04-01 13:31:29,729] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22773.55 | forward: 145609.22 | backward_microstep: 390637.90 | backward: 390625.27 | backward_inner_microstep: 390606.76 | backward_inner: 390600.13 | backward_allreduce_microstep: 7.86 | backward_allreduce: 2.70 | reduce_tied_grads: 0.31 | comms: 18.11 | reduce_grads: 0.20 | step: 379.35 | _step_clipping: 0.15 | _step_step: 375.77 | _step_zero_grad: 0.55 | _step_check_overflow: 0.66 samples/sec: 16.270 | iteration 10470/ 143000 | elapsed time per iteration (ms): 62938.0 | learning rate: 5.941E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.500493E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 13:42:02,138] [INFO] [logging.py:60:log_dist] [Rank 0] step=10480, skipped=6, lr=[0.000594097842004179, 0.000594097842004179], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10480 loss: 2.5039 iter time (s): 63.240 samples/sec: 16.192 %comms: 0.0029333957431424166 %optimizer_step 0.05786598882886233 %forward: 23.031502954157375 %backward: 61.76465541033607 [2025-04-01 13:42:02,138] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25796.13 | forward: 145651.84 | backward_microstep: 390614.55 | backward: 390601.33 | backward_inner_microstep: 390584.16 | backward_inner: 390577.47 | backward_allreduce_microstep: 8.08 | backward_allreduce: 2.78 | reduce_tied_grads: 0.38 | comms: 18.55 | reduce_grads: 0.22 | step: 365.95 | _step_clipping: 0.14 | _step_step: 363.87 | _step_zero_grad: 0.61 | _step_check_overflow: 0.66 samples/sec: 16.192 | iteration 10480/ 143000 | elapsed time per iteration (ms): 63240.9 | learning rate: 5.941E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.507807E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 13:52:39,024] [INFO] [logging.py:60:log_dist] [Rank 0] step=10490, skipped=6, lr=[0.000594084825777506, 0.000594084825777506], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10490 loss: 2.5012 iter time (s): 63.688 samples/sec: 16.078 %comms: 0.0028215024166295477 %optimizer_step 0.05581157889614227 %forward: 22.97411307620421 %backward: 61.36415736588206 [2025-04-01 13:52:39,025] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29351.38 | forward: 146317.72 | backward_microstep: 390830.47 | backward: 390816.56 | backward_inner_microstep: 390798.59 | backward_inner: 390791.93 | backward_allreduce_microstep: 8.48 | backward_allreduce: 2.86 | reduce_tied_grads: 0.30 | comms: 17.97 | reduce_grads: 0.21 | step: 355.45 | _step_clipping: 0.13 | _step_step: 353.76 | _step_zero_grad: 0.50 | _step_check_overflow: 0.49 samples/sec: 16.078 | iteration 10490/ 143000 | elapsed time per iteration (ms): 63688.6 | learning rate: 5.941E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.494132E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 14:03:10,262] [INFO] [logging.py:60:log_dist] [Rank 0] step=10500, skipped=6, lr=[0.0005940717953569804, 0.0005940717953569804], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10500 loss: 2.5114 iter time (s): 63.123 samples/sec: 16.222 %comms: 0.0028798369045542546 %optimizer_step 0.05656598566964285 %forward: 23.113832404435733 %backward: 61.9102080865954 [2025-04-01 14:03:10,263] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24187.99 | forward: 145901.99 | backward_microstep: 390811.14 | backward: 390797.28 | backward_inner_microstep: 390779.71 | backward_inner: 390772.85 | backward_allreduce_microstep: 8.40 | backward_allreduce: 2.84 | reduce_tied_grads: 0.31 | comms: 18.18 | reduce_grads: 0.19 | step: 357.06 | _step_clipping: 0.15 | _step_step: 355.26 | _step_zero_grad: 0.48 | _step_check_overflow: 0.58 samples/sec: 16.222 | iteration 10500/ 143000 | elapsed time per iteration (ms): 63123.8 | learning rate: 5.941E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.508811E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 14:13:46,987] [INFO] [logging.py:60:log_dist] [Rank 0] step=10510, skipped=6, lr=[0.0005940587507432314, 0.0005940587507432314], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10510 loss: 2.5024 iter time (s): 63.672 samples/sec: 16.082 %comms: 0.0038250316775612675 %optimizer_step 0.060229111829876184 %forward: 22.906899930199828 %backward: 61.33993718581365 [2025-04-01 14:13:46,988] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29928.98 | forward: 145852.54 | backward_microstep: 390575.93 | backward: 390562.92 | backward_inner_microstep: 390545.70 | backward_inner: 390539.07 | backward_allreduce_microstep: 8.09 | backward_allreduce: 2.80 | reduce_tied_grads: 0.34 | comms: 24.35 | reduce_grads: 0.23 | step: 383.49 | _step_clipping: 0.20 | _step_step: 381.25 | _step_zero_grad: 0.62 | _step_check_overflow: 0.72 samples/sec: 16.082 | iteration 10510/ 143000 | elapsed time per iteration (ms): 63672.5 | learning rate: 5.941E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.510704E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 14:24:26,317] [INFO] [logging.py:60:log_dist] [Rank 0] step=10520, skipped=6, lr=[0.0005940456919368885, 0.0005940456919368885], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10520 loss: 2.5020 iter time (s): 63.932 samples/sec: 16.017 %comms: 0.0028285492468544224 %optimizer_step 0.055980386181387135 %forward: 22.807652351046332 %backward: 61.10850900922732 [2025-04-01 14:24:26,317] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32484.28 | forward: 145814.62 | backward_microstep: 390691.64 | backward: 390680.89 | backward_inner_microstep: 390664.08 | backward_inner: 390657.55 | backward_allreduce_microstep: 7.97 | backward_allreduce: 2.74 | reduce_tied_grads: 0.33 | comms: 18.08 | reduce_grads: 0.21 | step: 357.90 | _step_clipping: 0.19 | _step_step: 356.06 | _step_zero_grad: 0.49 | _step_check_overflow: 0.53 samples/sec: 16.017 | iteration 10520/ 143000 | elapsed time per iteration (ms): 63932.9 | learning rate: 5.940E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.508497E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 14:34:58,718] [INFO] [logging.py:60:log_dist] [Rank 0] step=10530, skipped=6, lr=[0.000594032618938582, 0.000594032618938582], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10530 loss: 2.4934 iter time (s): 63.240 samples/sec: 16.192 %comms: 0.0028658326252369187 %optimizer_step 0.056497789324837754 %forward: 23.062454229125116 %backward: 61.77541967383598 [2025-04-01 14:34:58,718] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25523.31 | forward: 145845.86 | backward_microstep: 390677.39 | backward: 390664.80 | backward_inner_microstep: 390647.94 | backward_inner: 390641.40 | backward_allreduce_microstep: 7.93 | backward_allreduce: 2.78 | reduce_tied_grads: 0.32 | comms: 18.12 | reduce_grads: 0.19 | step: 357.29 | _step_clipping: 0.14 | _step_step: 355.55 | _step_zero_grad: 0.50 | _step_check_overflow: 0.51 samples/sec: 16.192 | iteration 10530/ 143000 | elapsed time per iteration (ms): 63240.1 | learning rate: 5.940E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.508772E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 14:45:32,066] [INFO] [logging.py:60:log_dist] [Rank 0] step=10540, skipped=6, lr=[0.0005940195317489428, 0.0005940195317489428], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10540 loss: 2.5574 iter time (s): 63.334 samples/sec: 16.168 %comms: 0.0028857099383728687 %optimizer_step 0.05629963359592396 %forward: 22.993570410357858 %backward: 61.64594880055954 [2025-04-01 14:45:32,067] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26932.35 | forward: 145628.26 | backward_microstep: 390439.92 | backward: 390430.54 | backward_inner_microstep: 390413.98 | backward_inner: 390407.70 | backward_allreduce_microstep: 7.89 | backward_allreduce: 2.72 | reduce_tied_grads: 0.30 | comms: 18.28 | reduce_grads: 0.23 | step: 356.57 | _step_clipping: 0.12 | _step_step: 354.90 | _step_zero_grad: 0.48 | _step_check_overflow: 0.49 samples/sec: 16.168 | iteration 10540/ 143000 | elapsed time per iteration (ms): 63334.9 | learning rate: 5.940E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.509351E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 14:56:10,481] [INFO] [logging.py:60:log_dist] [Rank 0] step=10550, skipped=6, lr=[0.0005940064303686025, 0.0005940064303686025], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10550 loss: 2.5166 iter time (s): 63.841 samples/sec: 16.040 %comms: 0.0028382423387190367 %optimizer_step 0.055780635947993526 %forward: 22.827797783184657 %backward: 61.16139432712322 [2025-04-01 14:56:10,481] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31877.62 | forward: 145734.55 | backward_microstep: 390468.13 | backward: 390459.39 | backward_inner_microstep: 390442.66 | backward_inner: 390436.23 | backward_allreduce_microstep: 8.04 | backward_allreduce: 2.77 | reduce_tied_grads: 0.31 | comms: 18.12 | reduce_grads: 0.21 | step: 356.11 | _step_clipping: 0.14 | _step_step: 354.36 | _step_zero_grad: 0.51 | _step_check_overflow: 0.51 samples/sec: 16.040 | iteration 10550/ 143000 | elapsed time per iteration (ms): 63841.4 | learning rate: 5.940E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.510062E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 15:06:38,807] [INFO] [logging.py:60:log_dist] [Rank 0] step=10560, skipped=6, lr=[0.0005939933147981936, 0.0005939933147981936], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10560 loss: 2.4900 iter time (s): 62.832 samples/sec: 16.297 %comms: 0.0029216053020200954 %optimizer_step 0.058529473911107584 %forward: 23.16737526311651 %backward: 62.19068802545834 [2025-04-01 15:06:38,807] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21612.85 | forward: 145565.32 | backward_microstep: 390773.57 | backward: 390756.70 | backward_inner_microstep: 390739.24 | backward_inner: 390731.78 | backward_allreduce_microstep: 7.97 | backward_allreduce: 2.76 | reduce_tied_grads: 0.37 | comms: 18.36 | reduce_grads: 0.22 | step: 367.75 | _step_clipping: 0.14 | _step_step: 365.87 | _step_zero_grad: 0.54 | _step_check_overflow: 0.56 samples/sec: 16.297 | iteration 10560/ 143000 | elapsed time per iteration (ms): 62832.6 | learning rate: 5.940E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.502905E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 15:17:08,225] [INFO] [logging.py:60:log_dist] [Rank 0] step=10570, skipped=6, lr=[0.000593980185038349, 0.000593980185038349], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10570 loss: 2.5109 iter time (s): 62.941 samples/sec: 16.269 %comms: 0.0031763879678560147 %optimizer_step 0.057712872743971674 %forward: 23.13112359649453 %backward: 61.9976361317334 [2025-04-01 15:17:08,226] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23316.19 | forward: 145590.23 | backward_microstep: 390232.30 | backward: 390221.01 | backward_inner_microstep: 390204.45 | backward_inner: 390198.13 | backward_allreduce_microstep: 7.88 | backward_allreduce: 2.72 | reduce_tied_grads: 0.32 | comms: 19.99 | reduce_grads: 0.21 | step: 363.25 | _step_clipping: 0.13 | _step_step: 361.47 | _step_zero_grad: 0.49 | _step_check_overflow: 0.55 samples/sec: 16.269 | iteration 10570/ 143000 | elapsed time per iteration (ms): 62941.8 | learning rate: 5.940E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.503186E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 15:27:38,716] [INFO] [logging.py:60:log_dist] [Rank 0] step=10580, skipped=6, lr=[0.0005939670410897024, 0.0005939670410897024], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10580 loss: 2.4953 iter time (s): 63.048 samples/sec: 16.241 %comms: 0.0028534163229437036 %optimizer_step 0.05724530127090446 %forward: 23.08955397345939 %backward: 61.91033704711523 [2025-04-01 15:27:38,716] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24264.26 | forward: 145576.08 | backward_microstep: 390345.63 | backward: 390335.15 | backward_inner_microstep: 390316.87 | backward_inner: 390310.50 | backward_allreduce_microstep: 9.56 | backward_allreduce: 2.77 | reduce_tied_grads: 0.31 | comms: 17.99 | reduce_grads: 0.20 | step: 360.92 | _step_clipping: 0.16 | _step_step: 359.19 | _step_zero_grad: 0.51 | _step_check_overflow: 0.48 samples/sec: 16.241 | iteration 10580/ 143000 | elapsed time per iteration (ms): 63049.0 | learning rate: 5.940E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.494278E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 15:38:07,315] [INFO] [logging.py:60:log_dist] [Rank 0] step=10590, skipped=6, lr=[0.0005939538829528882, 0.0005939538829528882], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10590 loss: 2.4993 iter time (s): 62.859 samples/sec: 16.290 %comms: 0.0028601022116184214 %optimizer_step 0.0563556615244996 %forward: 23.150783547449073 %backward: 62.08686687477981 [2025-04-01 15:38:07,316] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22503.37 | forward: 145524.43 | backward_microstep: 390284.21 | backward: 390274.30 | backward_inner_microstep: 390257.76 | backward_inner: 390251.46 | backward_allreduce_microstep: 7.83 | backward_allreduce: 2.68 | reduce_tied_grads: 0.31 | comms: 17.98 | reduce_grads: 0.19 | step: 354.25 | _step_clipping: 0.12 | _step_step: 352.55 | _step_zero_grad: 0.52 | _step_check_overflow: 0.50 samples/sec: 16.290 | iteration 10590/ 143000 | elapsed time per iteration (ms): 62860.0 | learning rate: 5.940E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.495187E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 15:48:37,381] [INFO] [logging.py:60:log_dist] [Rank 0] step=10600, skipped=6, lr=[0.0005939407106285415, 0.0005939407106285415], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10600 loss: 2.4849 iter time (s): 63.006 samples/sec: 16.252 %comms: 0.002878571924631901 %optimizer_step 0.05599144227672025 %forward: 23.08358449747809 %backward: 61.97795473744646 [2025-04-01 15:48:37,381] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23764.52 | forward: 145440.51 | backward_microstep: 390508.37 | backward: 390498.51 | backward_inner_microstep: 390481.82 | backward_inner: 390475.30 | backward_allreduce_microstep: 8.05 | backward_allreduce: 2.73 | reduce_tied_grads: 0.30 | comms: 18.14 | reduce_grads: 0.21 | step: 352.78 | _step_clipping: 0.14 | _step_step: 351.12 | _step_zero_grad: 0.47 | _step_check_overflow: 0.47 samples/sec: 16.252 | iteration 10600/ 143000 | elapsed time per iteration (ms): 63006.6 | learning rate: 5.939E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.501032E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 15:59:08,900] [INFO] [logging.py:60:log_dist] [Rank 0] step=10610, skipped=6, lr=[0.0005939275241172981, 0.0005939275241172981], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10610 loss: 2.4828 iter time (s): 63.151 samples/sec: 16.215 %comms: 0.0028696104855534675 %optimizer_step 0.05637229723666202 %forward: 23.067188050907053 %backward: 61.84793235256375 [2025-04-01 15:59:08,900] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24901.32 | forward: 145672.25 | backward_microstep: 390588.60 | backward: 390577.61 | backward_inner_microstep: 390560.77 | backward_inner: 390553.93 | backward_allreduce_microstep: 7.97 | backward_allreduce: 2.72 | reduce_tied_grads: 0.31 | comms: 18.12 | reduce_grads: 0.19 | step: 356.00 | _step_clipping: 0.12 | _step_step: 354.19 | _step_zero_grad: 0.51 | _step_check_overflow: 0.59 samples/sec: 16.215 | iteration 10610/ 143000 | elapsed time per iteration (ms): 63151.9 | learning rate: 5.939E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.494644E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 16:09:42,542] [INFO] [logging.py:60:log_dist] [Rank 0] step=10620, skipped=6, lr=[0.0005939143234197943, 0.0005939143234197943], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10620 loss: 2.5047 iter time (s): 63.364 samples/sec: 16.161 %comms: 0.0028570211379690836 %optimizer_step 0.056878312799588535 %forward: 22.97272277298825 %backward: 61.615097054466105 [2025-04-01 16:09:42,542] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27323.50 | forward: 145563.51 | backward_microstep: 390424.92 | backward: 390415.61 | backward_inner_microstep: 390399.10 | backward_inner: 390392.70 | backward_allreduce_microstep: 7.87 | backward_allreduce: 2.70 | reduce_tied_grads: 0.32 | comms: 18.10 | reduce_grads: 0.20 | step: 360.40 | _step_clipping: 0.13 | _step_step: 358.60 | _step_zero_grad: 0.50 | _step_check_overflow: 0.55 samples/sec: 16.161 | iteration 10620/ 143000 | elapsed time per iteration (ms): 63364.2 | learning rate: 5.939E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.500829E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 16:20:13,649] [INFO] [logging.py:60:log_dist] [Rank 0] step=10630, skipped=6, lr=[0.0005939011085366673, 0.0005939011085366673], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10630 loss: 2.5004 iter time (s): 63.110 samples/sec: 16.226 %comms: 0.0028525877187780347 %optimizer_step 0.056170730945378555 %forward: 23.07089838871517 %backward: 61.88099781510874 [2025-04-01 16:20:13,651] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24595.19 | forward: 145600.98 | backward_microstep: 390542.04 | backward: 390532.44 | backward_inner_microstep: 390513.44 | backward_inner: 390507.05 | backward_allreduce_microstep: 10.01 | backward_allreduce: 2.75 | reduce_tied_grads: 0.29 | comms: 18.00 | reduce_grads: 0.20 | step: 354.49 | _step_clipping: 0.14 | _step_step: 352.71 | _step_zero_grad: 0.49 | _step_check_overflow: 0.52 samples/sec: 16.225 | iteration 10630/ 143000 | elapsed time per iteration (ms): 63110.9 | learning rate: 5.939E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.506212E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 16:30:39,369] [INFO] [logging.py:60:log_dist] [Rank 0] step=10640, skipped=6, lr=[0.0005938878794685549, 0.0005938878794685549], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10640 loss: 2.4988 iter time (s): 62.571 samples/sec: 16.365 %comms: 0.002864734961919454 %optimizer_step 0.05528542581205684 %forward: 23.242903601058877 %backward: 62.39603428897574 [2025-04-01 16:30:39,370] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19496.35 | forward: 145433.91 | backward_microstep: 390429.54 | backward: 390420.21 | backward_inner_microstep: 390403.49 | backward_inner: 390397.28 | backward_allreduce_microstep: 8.03 | backward_allreduce: 2.80 | reduce_tied_grads: 0.28 | comms: 17.93 | reduce_grads: 0.19 | step: 345.93 | _step_clipping: 0.13 | _step_step: 343.97 | _step_zero_grad: 0.50 | _step_check_overflow: 0.54 samples/sec: 16.365 | iteration 10640/ 143000 | elapsed time per iteration (ms): 62571.9 | learning rate: 5.939E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.498307E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 16:41:09,929] [INFO] [logging.py:60:log_dist] [Rank 0] step=10650, skipped=6, lr=[0.0005938746362160956, 0.0005938746362160956], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10650 loss: 2.5093 iter time (s): 63.055 samples/sec: 16.240 %comms: 0.002877072905629001 %optimizer_step 0.05669585405347333 %forward: 23.114521920849363 %backward: 61.94274057753566 [2025-04-01 16:41:09,930] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23851.00 | forward: 145749.62 | backward_microstep: 390591.74 | backward: 390582.64 | backward_inner_microstep: 390566.11 | backward_inner: 390559.80 | backward_allreduce_microstep: 7.88 | backward_allreduce: 2.71 | reduce_tied_grads: 0.33 | comms: 18.14 | reduce_grads: 0.22 | step: 357.50 | _step_clipping: 0.14 | _step_step: 355.74 | _step_zero_grad: 0.49 | _step_check_overflow: 0.56 samples/sec: 16.240 | iteration 10650/ 143000 | elapsed time per iteration (ms): 63056.0 | learning rate: 5.939E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.495944E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 16:51:36,595] [INFO] [logging.py:60:log_dist] [Rank 0] step=10660, skipped=6, lr=[0.0005938613787799285, 0.0005938613787799285], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10660 loss: 2.4975 iter time (s): 62.666 samples/sec: 16.341 %comms: 0.002859033126061935 %optimizer_step 0.056396118599425143 %forward: 23.229165380002843 %backward: 62.27994618740155 [2025-04-01 16:51:36,596] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20475.41 | forward: 145568.08 | backward_microstep: 390292.19 | backward: 390284.03 | backward_inner_microstep: 390268.12 | backward_inner: 390262.01 | backward_allreduce_microstep: 7.60 | backward_allreduce: 2.62 | reduce_tied_grads: 0.28 | comms: 17.92 | reduce_grads: 0.18 | step: 353.41 | _step_clipping: 0.12 | _step_step: 351.71 | _step_zero_grad: 0.46 | _step_check_overflow: 0.56 samples/sec: 16.340 | iteration 10660/ 143000 | elapsed time per iteration (ms): 62666.6 | learning rate: 5.939E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.492602E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 17:02:06,512] [INFO] [logging.py:60:log_dist] [Rank 0] step=10670, skipped=6, lr=[0.0005938481071606938, 0.0005938481071606938], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10670 loss: 2.5128 iter time (s): 62.991 samples/sec: 16.256 %comms: 0.0028583609868627884 %optimizer_step 0.05694182560231268 %forward: 23.08251347453167 %backward: 61.943961076429076 [2025-04-01 17:02:06,513] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24011.74 | forward: 145399.31 | backward_microstep: 390199.60 | backward: 390191.88 | backward_inner_microstep: 390176.15 | backward_inner: 390170.13 | backward_allreduce_microstep: 7.57 | backward_allreduce: 2.61 | reduce_tied_grads: 0.29 | comms: 18.01 | reduce_grads: 0.19 | step: 358.68 | _step_clipping: 0.13 | _step_step: 356.99 | _step_zero_grad: 0.50 | _step_check_overflow: 0.48 samples/sec: 16.256 | iteration 10670/ 143000 | elapsed time per iteration (ms): 62991.7 | learning rate: 5.938E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.505887E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 17:12:32,091] [INFO] [logging.py:60:log_dist] [Rank 0] step=10680, skipped=6, lr=[0.0005938348213590317, 0.0005938348213590317], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10680 loss: 2.5099 iter time (s): 62.557 samples/sec: 16.369 %comms: 0.002880053314951808 %optimizer_step 0.05625349953651676 %forward: 23.239460544976243 %backward: 62.386748386253544 [2025-04-01 17:12:32,091] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19574.02 | forward: 145379.62 | backward_microstep: 390283.85 | backward: 390274.20 | backward_inner_microstep: 390258.38 | backward_inner: 390252.25 | backward_allreduce_microstep: 7.52 | backward_allreduce: 2.60 | reduce_tied_grads: 0.31 | comms: 18.02 | reduce_grads: 0.19 | step: 351.91 | _step_clipping: 0.14 | _step_step: 350.16 | _step_zero_grad: 0.50 | _step_check_overflow: 0.53 samples/sec: 16.369 | iteration 10680/ 143000 | elapsed time per iteration (ms): 62557.8 | learning rate: 5.938E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.507306E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 17:23:00,188] [INFO] [logging.py:60:log_dist] [Rank 0] step=10690, skipped=6, lr=[0.0005938215213755834, 0.0005938215213755834], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10690 loss: 2.5084 iter time (s): 62.809 samples/sec: 16.303 %comms: 0.0028795097730154773 %optimizer_step 0.056389735435373296 %forward: 23.204155000276653 %backward: 62.14272633758764 [2025-04-01 17:23:00,188] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21694.31 | forward: 145743.33 | backward_microstep: 390324.18 | backward: 390313.19 | backward_inner_microstep: 390294.77 | backward_inner: 390288.54 | backward_allreduce_microstep: 7.98 | backward_allreduce: 2.75 | reduce_tied_grads: 0.30 | comms: 18.09 | reduce_grads: 0.34 | step: 354.18 | _step_clipping: 0.13 | _step_step: 352.41 | _step_zero_grad: 0.50 | _step_check_overflow: 0.57 samples/sec: 16.303 | iteration 10690/ 143000 | elapsed time per iteration (ms): 62809.7 | learning rate: 5.938E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.500453E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 17:33:25,395] [INFO] [logging.py:60:log_dist] [Rank 0] step=10700, skipped=6, lr=[0.000593808207210991, 0.000593808207210991], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10700 loss: 2.5101 iter time (s): 62.520 samples/sec: 16.379 %comms: 0.0028811879224636934 %optimizer_step 0.05705006063309789 %forward: 23.272715227444415 %backward: 62.43073679510586 [2025-04-01 17:33:25,396] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19052.29 | forward: 145501.44 | backward_microstep: 390327.14 | backward: 390318.09 | backward_inner_microstep: 390301.82 | backward_inner: 390295.62 | backward_allreduce_microstep: 7.77 | backward_allreduce: 2.66 | reduce_tied_grads: 0.33 | comms: 18.01 | reduce_grads: 0.19 | step: 356.68 | _step_clipping: 0.13 | _step_step: 354.79 | _step_zero_grad: 0.46 | _step_check_overflow: 0.73 samples/sec: 16.379 | iteration 10700/ 143000 | elapsed time per iteration (ms): 62520.7 | learning rate: 5.938E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.500448E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 17:43:54,528] [INFO] [logging.py:60:log_dist] [Rank 0] step=10710, skipped=6, lr=[0.0005937948788658971, 0.0005937948788658971], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10710 loss: 2.5200 iter time (s): 62.913 samples/sec: 16.277 %comms: 0.0028785608329099668 %optimizer_step 0.0562032200160236 %forward: 23.092567176166163 %backward: 62.03134009364521 [2025-04-01 17:43:54,528] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23242.91 | forward: 145281.54 | backward_microstep: 390265.12 | backward: 390255.80 | backward_inner_microstep: 390238.99 | backward_inner: 390232.43 | backward_allreduce_microstep: 7.98 | backward_allreduce: 2.77 | reduce_tied_grads: 0.31 | comms: 18.11 | reduce_grads: 0.20 | step: 353.59 | _step_clipping: 0.13 | _step_step: 351.80 | _step_zero_grad: 0.50 | _step_check_overflow: 0.58 samples/sec: 16.276 | iteration 10710/ 143000 | elapsed time per iteration (ms): 62913.3 | learning rate: 5.938E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.497173E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 17:54:23,730] [INFO] [logging.py:60:log_dist] [Rank 0] step=10720, skipped=6, lr=[0.0005937815363409448, 0.0005937815363409448], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10720 loss: 2.4960 iter time (s): 62.920 samples/sec: 16.275 %comms: 0.0028980601665785782 %optimizer_step 0.05699941461086841 %forward: 23.12008946031064 %backward: 62.03574250611019 [2025-04-01 17:54:23,730] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23025.02 | forward: 145470.78 | backward_microstep: 390336.62 | backward: 390326.68 | backward_inner_microstep: 390309.88 | backward_inner: 390301.66 | backward_allreduce_microstep: 8.05 | backward_allreduce: 2.76 | reduce_tied_grads: 0.30 | comms: 18.23 | reduce_grads: 0.20 | step: 358.64 | _step_clipping: 0.14 | _step_step: 356.79 | _step_zero_grad: 0.56 | _step_check_overflow: 0.55 samples/sec: 16.275 | iteration 10720/ 143000 | elapsed time per iteration (ms): 62920.2 | learning rate: 5.938E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.501052E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 18:04:50,134] [INFO] [logging.py:60:log_dist] [Rank 0] step=10730, skipped=6, lr=[0.0005937681796367783, 0.0005937681796367783], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10730 loss: 2.4967 iter time (s): 62.639 samples/sec: 16.348 %comms: 0.0036601700032667635 %optimizer_step 0.058666020712489574 %forward: 23.228056056466382 %backward: 62.33335728469901 [2025-04-01 18:04:50,135] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20025.27 | forward: 145498.90 | backward_microstep: 390461.28 | backward: 390451.74 | backward_inner_microstep: 390435.22 | backward_inner: 390428.78 | backward_allreduce_microstep: 7.89 | backward_allreduce: 2.70 | reduce_tied_grads: 0.31 | comms: 22.93 | reduce_grads: 0.19 | step: 367.48 | _step_clipping: 0.13 | _step_step: 365.60 | _step_zero_grad: 0.55 | _step_check_overflow: 0.60 samples/sec: 16.347 | iteration 10730/ 143000 | elapsed time per iteration (ms): 62640.4 | learning rate: 5.938E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.490444E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 18:15:27,728] [INFO] [logging.py:60:log_dist] [Rank 0] step=10740, skipped=6, lr=[0.0005937548087540421, 0.0005937548087540421], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10740 loss: 2.4975 iter time (s): 63.759 samples/sec: 16.061 %comms: 0.0028270849431832304 %optimizer_step 0.05517114058770265 %forward: 22.800724783315182 %backward: 61.20708970530859 [2025-04-01 18:15:27,728] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31607.58 | forward: 145374.73 | backward_microstep: 390258.83 | backward: 390249.18 | backward_inner_microstep: 390233.12 | backward_inner: 390225.33 | backward_allreduce_microstep: 7.62 | backward_allreduce: 2.63 | reduce_tied_grads: 0.34 | comms: 18.03 | reduce_grads: 0.22 | step: 351.76 | _step_clipping: 0.15 | _step_step: 350.04 | _step_zero_grad: 0.48 | _step_check_overflow: 0.52 samples/sec: 16.060 | iteration 10740/ 143000 | elapsed time per iteration (ms): 63759.4 | learning rate: 5.938E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.490408E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 18:25:57,151] [INFO] [logging.py:60:log_dist] [Rank 0] step=10750, skipped=6, lr=[0.0005937414236933817, 0.0005937414236933817], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10750 loss: 2.5059 iter time (s): 62.942 samples/sec: 16.269 %comms: 0.002898217634223605 %optimizer_step 0.057341541957502414 %forward: 23.125891584171566 %backward: 62.04034707057696 [2025-04-01 18:25:57,151] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22950.00 | forward: 145558.36 | backward_microstep: 390502.07 | backward: 390492.66 | backward_inner_microstep: 390475.88 | backward_inner: 390469.56 | backward_allreduce_microstep: 8.13 | backward_allreduce: 2.74 | reduce_tied_grads: 0.32 | comms: 18.24 | reduce_grads: 0.20 | step: 360.92 | _step_clipping: 0.14 | _step_step: 358.94 | _step_zero_grad: 0.54 | _step_check_overflow: 0.68 samples/sec: 16.269 | iteration 10750/ 143000 | elapsed time per iteration (ms): 62942.3 | learning rate: 5.937E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.500315E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 18:36:27,428] [INFO] [logging.py:60:log_dist] [Rank 0] step=10760, skipped=6, lr=[0.0005937280244554428, 0.0005937280244554428], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10760 loss: 2.4949 iter time (s): 63.027 samples/sec: 16.247 %comms: 0.0028741255050235117 %optimizer_step 0.05587909918717273 %forward: 23.08902867204173 %backward: 61.954249848077794 [2025-04-01 18:36:27,429] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23836.21 | forward: 145523.66 | backward_microstep: 390489.34 | backward: 390480.22 | backward_inner_microstep: 390463.04 | backward_inner: 390456.72 | backward_allreduce_microstep: 8.47 | backward_allreduce: 2.78 | reduce_tied_grads: 0.30 | comms: 18.11 | reduce_grads: 0.19 | step: 352.19 | _step_clipping: 0.14 | _step_step: 350.44 | _step_zero_grad: 0.49 | _step_check_overflow: 0.57 samples/sec: 16.247 | iteration 10760/ 143000 | elapsed time per iteration (ms): 63027.7 | learning rate: 5.937E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.500200E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 18:46:55,670] [INFO] [logging.py:60:log_dist] [Rank 0] step=10770, skipped=6, lr=[0.0005937146110408726, 0.0005937146110408726], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10770 loss: 2.4757 iter time (s): 62.824 samples/sec: 16.300 %comms: 0.0028638957727440714 %optimizer_step 0.05596361408731977 %forward: 23.13420295704563 %backward: 62.12328247396527 [2025-04-01 18:46:55,671] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22230.24 | forward: 145337.35 | backward_microstep: 390288.33 | backward: 390280.73 | backward_inner_microstep: 390264.75 | backward_inner: 390256.90 | backward_allreduce_microstep: 7.69 | backward_allreduce: 2.65 | reduce_tied_grads: 0.30 | comms: 17.99 | reduce_grads: 0.20 | step: 351.58 | _step_clipping: 0.12 | _step_step: 349.87 | _step_zero_grad: 0.50 | _step_check_overflow: 0.53 samples/sec: 16.299 | iteration 10770/ 143000 | elapsed time per iteration (ms): 62824.2 | learning rate: 5.937E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.490723E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 18:57:24,304] [INFO] [logging.py:60:log_dist] [Rank 0] step=10780, skipped=6, lr=[0.0005937011834503181, 0.0005937011834503181], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10780 loss: 2.4834 iter time (s): 62.863 samples/sec: 16.289 %comms: 0.0028659426879632135 %optimizer_step 0.05792818658828322 %forward: 23.116833860835122 %backward: 62.06368029380398 [2025-04-01 18:57:24,304] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22800.12 | forward: 145318.74 | backward_microstep: 390157.45 | backward: 390149.28 | backward_inner_microstep: 390133.22 | backward_inner: 390127.05 | backward_allreduce_microstep: 7.68 | backward_allreduce: 2.63 | reduce_tied_grads: 0.32 | comms: 18.02 | reduce_grads: 0.20 | step: 364.15 | _step_clipping: 0.19 | _step_step: 362.31 | _step_zero_grad: 0.53 | _step_check_overflow: 0.49 samples/sec: 16.289 | iteration 10780/ 143000 | elapsed time per iteration (ms): 62863.4 | learning rate: 5.937E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.492142E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 19:07:54,417] [INFO] [logging.py:60:log_dist] [Rank 0] step=10790, skipped=6, lr=[0.0005936877416844274, 0.0005936877416844274], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10790 loss: 2.5143 iter time (s): 63.011 samples/sec: 16.251 %comms: 0.0028626144109280246 %optimizer_step 0.055501384520836826 %forward: 23.096738545512636 %backward: 61.921445251927096 [2025-04-01 19:07:54,417] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24043.79 | forward: 145534.36 | backward_microstep: 390181.49 | backward: 390171.88 | backward_inner_microstep: 390156.01 | backward_inner: 390150.09 | backward_allreduce_microstep: 7.65 | backward_allreduce: 2.64 | reduce_tied_grads: 0.30 | comms: 18.04 | reduce_grads: 0.20 | step: 349.72 | _step_clipping: 0.13 | _step_step: 348.08 | _step_zero_grad: 0.47 | _step_check_overflow: 0.46 samples/sec: 16.251 | iteration 10790/ 143000 | elapsed time per iteration (ms): 63011.3 | learning rate: 5.937E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.492222E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 19:18:21,919] [INFO] [logging.py:60:log_dist] [Rank 0] step=10800, skipped=6, lr=[0.0005936742857438495, 0.0005936742857438495], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10800 loss: 2.5017 iter time (s): 62.750 samples/sec: 16.319 %comms: 0.00287836180151358 %optimizer_step 0.055434681343731224 %forward: 23.153691158440772 %backward: 62.193114710614324 [2025-04-01 19:18:21,920] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21581.86 | forward: 145288.75 | backward_microstep: 390267.94 | backward: 390260.01 | backward_inner_microstep: 390244.67 | backward_inner: 390238.82 | backward_allreduce_microstep: 7.35 | backward_allreduce: 2.52 | reduce_tied_grads: 0.27 | comms: 18.06 | reduce_grads: 0.18 | step: 347.85 | _step_clipping: 0.11 | _step_step: 346.19 | _step_zero_grad: 0.47 | _step_check_overflow: 0.53 samples/sec: 16.319 | iteration 10800/ 143000 | elapsed time per iteration (ms): 62750.3 | learning rate: 5.937E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.493356E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 19:28:41,466] [INFO] [logging.py:60:log_dist] [Rank 0] step=10810, skipped=6, lr=[0.0005936608156292336, 0.0005936608156292336], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10810 loss: 2.4946 iter time (s): 61.954 samples/sec: 16.528 %comms: 0.0028928121868316073 %optimizer_step 0.05603060605530169 %forward: 23.442200048939252 %backward: 62.98203439447053 [2025-04-01 19:28:41,467] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13761.98 | forward: 145234.08 | backward_microstep: 390207.39 | backward: 390199.64 | backward_inner_microstep: 390184.55 | backward_inner: 390178.69 | backward_allreduce_microstep: 7.21 | backward_allreduce: 2.48 | reduce_tied_grads: 0.29 | comms: 17.92 | reduce_grads: 0.19 | step: 347.13 | _step_clipping: 0.12 | _step_step: 345.50 | _step_zero_grad: 0.45 | _step_check_overflow: 0.51 samples/sec: 16.528 | iteration 10810/ 143000 | elapsed time per iteration (ms): 61954.7 | learning rate: 5.937E-04 | approx flops per GPU: 71.3TFLOPS | lm_loss: 2.492018E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 19:39:10,424] [INFO] [logging.py:60:log_dist] [Rank 0] step=10820, skipped=6, lr=[0.0005936473313412298, 0.0005936473313412298], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10820 loss: 2.4790 iter time (s): 62.895 samples/sec: 16.281 %comms: 0.002851651466773374 %optimizer_step 0.05934484313041665 %forward: 23.129913393259674 %backward: 62.036403178764054 [2025-04-01 19:39:10,424] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22943.86 | forward: 145476.02 | backward_microstep: 390186.90 | backward: 390179.10 | backward_inner_microstep: 390163.37 | backward_inner: 390157.40 | backward_allreduce_microstep: 7.57 | backward_allreduce: 2.61 | reduce_tied_grads: 0.28 | comms: 17.94 | reduce_grads: 0.19 | step: 373.25 | _step_clipping: 0.12 | _step_step: 371.47 | _step_zero_grad: 0.52 | _step_check_overflow: 0.57 samples/sec: 16.281 | iteration 10820/ 143000 | elapsed time per iteration (ms): 62895.7 | learning rate: 5.936E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.498797E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 19:49:40,784] [INFO] [logging.py:60:log_dist] [Rank 0] step=10830, skipped=6, lr=[0.0005936338328804893, 0.0005936338328804893], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10830 loss: 2.4966 iter time (s): 63.035 samples/sec: 16.245 %comms: 0.0028494658749083353 %optimizer_step 0.056045295055385115 %forward: 23.051787838940026 %backward: 61.895617156644434 [2025-04-01 19:49:40,784] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24540.20 | forward: 145308.02 | backward_microstep: 390169.40 | backward: 390161.90 | backward_inner_microstep: 390146.53 | backward_inner: 390140.61 | backward_allreduce_microstep: 7.36 | backward_allreduce: 2.53 | reduce_tied_grads: 0.29 | comms: 17.96 | reduce_grads: 0.22 | step: 353.28 | _step_clipping: 0.11 | _step_step: 351.49 | _step_zero_grad: 0.49 | _step_check_overflow: 0.65 samples/sec: 16.245 | iteration 10830/ 143000 | elapsed time per iteration (ms): 63036.0 | learning rate: 5.936E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.495916E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 20:00:07,375] [INFO] [logging.py:60:log_dist] [Rank 0] step=10840, skipped=6, lr=[0.0005936203202476631, 0.0005936203202476631], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10840 loss: 2.5157 iter time (s): 62.659 samples/sec: 16.343 %comms: 0.0028847205938104326 %optimizer_step 0.05622768025063506 %forward: 23.227156048658973 %backward: 62.30045837195429 [2025-04-01 20:00:07,375] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20303.02 | forward: 145537.89 | backward_microstep: 390373.24 | backward: 390365.35 | backward_inner_microstep: 390349.32 | backward_inner: 390343.14 | backward_allreduce_microstep: 7.69 | backward_allreduce: 2.66 | reduce_tied_grads: 0.32 | comms: 18.08 | reduce_grads: 0.19 | step: 352.31 | _step_clipping: 0.14 | _step_step: 350.59 | _step_zero_grad: 0.47 | _step_check_overflow: 0.55 samples/sec: 16.342 | iteration 10840/ 143000 | elapsed time per iteration (ms): 62659.1 | learning rate: 5.936E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.503848E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 20:10:36,193] [INFO] [logging.py:60:log_dist] [Rank 0] step=10850, skipped=6, lr=[0.0005936067934434039, 0.0005936067934434039], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10850 loss: 2.4914 iter time (s): 62.881 samples/sec: 16.285 %comms: 0.0028720356050638525 %optimizer_step 0.05604609820222416 %forward: 23.138485580281493 %backward: 62.079272737129145 [2025-04-01 20:10:36,194] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22547.78 | forward: 145497.78 | backward_microstep: 390370.27 | backward: 390362.48 | backward_inner_microstep: 390346.68 | backward_inner: 390340.64 | backward_allreduce_microstep: 7.55 | backward_allreduce: 2.60 | reduce_tied_grads: 0.35 | comms: 18.06 | reduce_grads: 0.19 | step: 352.43 | _step_clipping: 0.17 | _step_step: 350.54 | _step_zero_grad: 0.52 | _step_check_overflow: 0.61 samples/sec: 16.285 | iteration 10850/ 143000 | elapsed time per iteration (ms): 62881.9 | learning rate: 5.936E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.505162E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 20:21:01,385] [INFO] [logging.py:60:log_dist] [Rank 0] step=10860, skipped=6, lr=[0.0005935932524683639, 0.0005935932524683639], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10860 loss: 2.4934 iter time (s): 62.519 samples/sec: 16.379 %comms: 0.002865015727657657 %optimizer_step 0.05713949658726162 %forward: 23.28768567426642 %backward: 62.45605555649713 [2025-04-01 20:21:01,385] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18699.26 | forward: 145591.31 | backward_microstep: 390475.69 | backward: 390466.41 | backward_inner_microstep: 390450.74 | backward_inner: 390444.76 | backward_allreduce_microstep: 7.52 | backward_allreduce: 2.58 | reduce_tied_grads: 0.28 | comms: 17.91 | reduce_grads: 0.18 | step: 357.23 | _step_clipping: 0.15 | _step_step: 355.49 | _step_zero_grad: 0.49 | _step_check_overflow: 0.53 samples/sec: 16.379 | iteration 10860/ 143000 | elapsed time per iteration (ms): 62519.1 | learning rate: 5.936E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.502814E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 20:31:35,710] [INFO] [logging.py:60:log_dist] [Rank 0] step=10870, skipped=6, lr=[0.0005935796973231973, 0.0005935796973231973], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10870 loss: 2.4926 iter time (s): 63.432 samples/sec: 16.143 %comms: 0.0028494723882819843 %optimizer_step 0.0574997600767252 %forward: 22.951385459078917 %backward: 61.534014826501945 [2025-04-01 20:31:35,710] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28006.90 | forward: 145585.05 | backward_microstep: 390331.21 | backward: 390322.08 | backward_inner_microstep: 390306.34 | backward_inner: 390298.68 | backward_allreduce_microstep: 7.55 | backward_allreduce: 2.59 | reduce_tied_grads: 0.30 | comms: 18.07 | reduce_grads: 0.19 | step: 364.73 | _step_clipping: 0.11 | _step_step: 362.96 | _step_zero_grad: 0.52 | _step_check_overflow: 0.56 samples/sec: 16.143 | iteration 10870/ 143000 | elapsed time per iteration (ms): 63432.5 | learning rate: 5.936E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.497841E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 20:42:06,894] [INFO] [logging.py:60:log_dist] [Rank 0] step=10880, skipped=6, lr=[0.0005935661280085579, 0.0005935661280085579], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10880 loss: 2.4774 iter time (s): 63.118 samples/sec: 16.224 %comms: 0.002874758565608789 %optimizer_step 0.057471336181042484 %forward: 23.09409883281017 %backward: 61.854062960174794 [2025-04-01 20:42:06,894] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24566.50 | forward: 145764.89 | backward_microstep: 390417.38 | backward: 390409.29 | backward_inner_microstep: 390393.37 | backward_inner: 390387.39 | backward_allreduce_microstep: 7.63 | backward_allreduce: 2.62 | reduce_tied_grads: 0.29 | comms: 18.14 | reduce_grads: 0.19 | step: 362.75 | _step_clipping: 0.13 | _step_step: 361.04 | _step_zero_grad: 0.47 | _step_check_overflow: 0.56 samples/sec: 16.223 | iteration 10880/ 143000 | elapsed time per iteration (ms): 63118.4 | learning rate: 5.936E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.488112E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 20:52:35,307] [INFO] [logging.py:60:log_dist] [Rank 0] step=10890, skipped=6, lr=[0.0005935525445251008, 0.0005935525445251008], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10890 loss: 2.4950 iter time (s): 62.841 samples/sec: 16.295 %comms: 0.004549664535607192 %optimizer_step 0.05816841547128152 %forward: 23.1432818347265 %backward: 62.10609285624891 [2025-04-01 20:52:35,307] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22268.30 | forward: 145434.16 | backward_microstep: 390287.07 | backward: 390279.45 | backward_inner_microstep: 390263.79 | backward_inner: 390257.74 | backward_allreduce_microstep: 7.50 | backward_allreduce: 2.59 | reduce_tied_grads: 0.30 | comms: 28.59 | reduce_grads: 0.19 | step: 365.53 | _step_clipping: 0.14 | _step_step: 363.80 | _step_zero_grad: 0.48 | _step_check_overflow: 0.54 samples/sec: 16.295 | iteration 10890/ 143000 | elapsed time per iteration (ms): 62841.3 | learning rate: 5.936E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.490918E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 21:03:04,839] [INFO] [logging.py:60:log_dist] [Rank 0] step=10900, skipped=6, lr=[0.0005935389468734816, 0.0005935389468734816], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10900 loss: 2.4770 iter time (s): 62.953 samples/sec: 16.266 %comms: 0.0028517396381555664 %optimizer_step 0.05839923303966357 %forward: 23.07620280781676 %backward: 61.97777987024524 [2025-04-01 21:03:04,839] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23693.03 | forward: 145270.69 | backward_microstep: 390174.22 | backward: 390166.23 | backward_inner_microstep: 390148.89 | backward_inner: 390142.71 | backward_allreduce_microstep: 7.46 | backward_allreduce: 2.56 | reduce_tied_grads: 0.30 | comms: 17.95 | reduce_grads: 0.20 | step: 367.64 | _step_clipping: 0.12 | _step_step: 365.93 | _step_zero_grad: 0.52 | _step_check_overflow: 0.50 samples/sec: 16.266 | iteration 10900/ 143000 | elapsed time per iteration (ms): 62953.2 | learning rate: 5.935E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.481515E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 21:13:29,558] [INFO] [logging.py:60:log_dist] [Rank 0] step=10910, skipped=6, lr=[0.0005935253350543566, 0.0005935253350543566], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10910 loss: 2.5010 iter time (s): 62.471 samples/sec: 16.392 %comms: 0.002873020919987699 %optimizer_step 0.05530599618994758 %forward: 23.280832683562007 %backward: 62.46518122765795 [2025-04-01 21:13:29,558] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18644.18 | forward: 145438.51 | backward_microstep: 390236.73 | backward: 390228.44 | backward_inner_microstep: 390212.51 | backward_inner: 390206.53 | backward_allreduce_microstep: 7.75 | backward_allreduce: 2.60 | reduce_tied_grads: 0.30 | comms: 17.95 | reduce_grads: 0.20 | step: 345.50 | _step_clipping: 0.14 | _step_step: 343.79 | _step_zero_grad: 0.50 | _step_check_overflow: 0.51 samples/sec: 16.391 | iteration 10910/ 143000 | elapsed time per iteration (ms): 62471.9 | learning rate: 5.935E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.493387E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 21:24:00,123] [INFO] [logging.py:60:log_dist] [Rank 0] step=10920, skipped=6, lr=[0.0005935117090683827, 0.0005935117090683827], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10920 loss: 2.4921 iter time (s): 63.056 samples/sec: 16.240 %comms: 0.002866308722039508 %optimizer_step 0.05623344788431197 %forward: 23.077384631008716 %backward: 61.9028594910536 [2025-04-01 21:24:00,124] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24276.52 | forward: 145516.76 | backward_microstep: 390342.47 | backward: 390334.69 | backward_inner_microstep: 390318.83 | backward_inner: 390312.61 | backward_allreduce_microstep: 7.62 | backward_allreduce: 2.60 | reduce_tied_grads: 0.32 | comms: 18.07 | reduce_grads: 0.20 | step: 354.59 | _step_clipping: 0.15 | _step_step: 352.81 | _step_zero_grad: 0.48 | _step_check_overflow: 0.56 samples/sec: 16.239 | iteration 10920/ 143000 | elapsed time per iteration (ms): 63056.6 | learning rate: 5.935E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.492777E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 21:34:30,279] [INFO] [logging.py:60:log_dist] [Rank 0] step=10930, skipped=6, lr=[0.0005934980689162175, 0.0005934980689162175], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10930 loss: 2.4920 iter time (s): 63.015 samples/sec: 16.250 %comms: 0.0028525475820999633 %optimizer_step 0.0573153440671752 %forward: 23.067383566904095 %backward: 61.93228831672366 [2025-04-01 21:34:30,280] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24091.11 | forward: 145359.13 | backward_microstep: 390274.39 | backward: 390266.35 | backward_inner_microstep: 390250.55 | backward_inner: 390244.47 | backward_allreduce_microstep: 7.61 | backward_allreduce: 2.55 | reduce_tied_grads: 0.30 | comms: 17.98 | reduce_grads: 0.19 | step: 361.17 | _step_clipping: 0.14 | _step_step: 359.51 | _step_zero_grad: 0.49 | _step_check_overflow: 0.48 samples/sec: 16.250 | iteration 10930/ 143000 | elapsed time per iteration (ms): 63015.6 | learning rate: 5.935E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.486957E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 21:44:59,627] [INFO] [logging.py:60:log_dist] [Rank 0] step=10940, skipped=6, lr=[0.0005934844145985195, 0.0005934844145985195], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10940 loss: 2.4915 iter time (s): 62.934 samples/sec: 16.271 %comms: 0.002866436823722177 %optimizer_step 0.05615831770635174 %forward: 23.112965756215377 %backward: 62.02615690883204 [2025-04-01 21:44:59,628] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23076.97 | forward: 145459.70 | backward_microstep: 390365.34 | backward: 390356.93 | backward_inner_microstep: 390340.85 | backward_inner: 390334.77 | backward_allreduce_microstep: 7.74 | backward_allreduce: 2.68 | reduce_tied_grads: 0.28 | comms: 18.04 | reduce_grads: 0.20 | step: 353.43 | _step_clipping: 0.14 | _step_step: 351.67 | _step_zero_grad: 0.49 | _step_check_overflow: 0.56 samples/sec: 16.271 | iteration 10940/ 143000 | elapsed time per iteration (ms): 62934.8 | learning rate: 5.935E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.494679E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 21:55:30,017] [INFO] [logging.py:60:log_dist] [Rank 0] step=10950, skipped=6, lr=[0.0005934707461159476, 0.0005934707461159476], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10950 loss: 2.4924 iter time (s): 63.038 samples/sec: 16.244 %comms: 0.002850618924697132 %optimizer_step 0.05744898600700076 %forward: 23.09560806132052 %backward: 61.9414745192894 [2025-04-01 21:55:30,018] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23854.62 | forward: 145591.03 | backward_microstep: 390477.02 | backward: 390469.17 | backward_inner_microstep: 390453.26 | backward_inner: 390445.38 | backward_allreduce_microstep: 7.65 | backward_allreduce: 2.62 | reduce_tied_grads: 0.31 | comms: 17.97 | reduce_grads: 0.19 | step: 362.15 | _step_clipping: 0.13 | _step_step: 360.39 | _step_zero_grad: 0.49 | _step_check_overflow: 0.57 samples/sec: 16.244 | iteration 10950/ 143000 | elapsed time per iteration (ms): 63039.0 | learning rate: 5.935E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.491832E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 22:06:00,253] [INFO] [logging.py:60:log_dist] [Rank 0] step=10960, skipped=6, lr=[0.0005934570634691615, 0.0005934570634691615], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10960 loss: 2.5054 iter time (s): 63.023 samples/sec: 16.248 %comms: 0.0028368610776540325 %optimizer_step 0.0555701926903868 %forward: 23.09377911676531 %backward: 61.94608560975622 [2025-04-01 22:06:00,254] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23835.56 | forward: 145544.10 | backward_microstep: 390411.45 | backward: 390403.28 | backward_inner_microstep: 390387.44 | backward_inner: 390381.40 | backward_allreduce_microstep: 7.59 | backward_allreduce: 2.63 | reduce_tied_grads: 0.27 | comms: 17.88 | reduce_grads: 0.18 | step: 350.22 | _step_clipping: 0.12 | _step_step: 348.57 | _step_zero_grad: 0.47 | _step_check_overflow: 0.53 samples/sec: 16.248 | iteration 10960/ 143000 | elapsed time per iteration (ms): 63023.6 | learning rate: 5.935E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.489396E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 22:16:29,320] [INFO] [logging.py:60:log_dist] [Rank 0] step=10970, skipped=6, lr=[0.0005934433666588217, 0.0005934433666588217], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10970 loss: 2.5035 iter time (s): 62.906 samples/sec: 16.278 %comms: 0.0028439556954644152 %optimizer_step 0.055859963774674205 %forward: 23.132366436075973 %backward: 62.0551762660162 [2025-04-01 22:16:29,320] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22715.92 | forward: 145516.69 | backward_microstep: 390372.50 | backward: 390364.89 | backward_inner_microstep: 390348.74 | backward_inner: 390342.77 | backward_allreduce_microstep: 7.75 | backward_allreduce: 2.68 | reduce_tied_grads: 0.27 | comms: 17.89 | reduce_grads: 0.19 | step: 351.39 | _step_clipping: 0.13 | _step_step: 349.64 | _step_zero_grad: 0.51 | _step_check_overflow: 0.56 samples/sec: 16.278 | iteration 10970/ 143000 | elapsed time per iteration (ms): 62906.6 | learning rate: 5.934E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.497210E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 22:26:55,373] [INFO] [logging.py:60:log_dist] [Rank 0] step=10980, skipped=6, lr=[0.0005934296556855891, 0.0005934296556855891], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10980 loss: 2.4945 iter time (s): 62.605 samples/sec: 16.357 %comms: 0.0028538329410479337 %optimizer_step 0.05610561604728425 %forward: 23.23412414614296 %backward: 62.35471638413285 [2025-04-01 22:26:55,374] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19790.07 | forward: 145456.84 | backward_microstep: 390377.39 | backward: 390370.65 | backward_inner_microstep: 390355.56 | backward_inner: 390349.84 | backward_allreduce_microstep: 7.33 | backward_allreduce: 2.52 | reduce_tied_grads: 0.27 | comms: 17.87 | reduce_grads: 0.18 | step: 351.25 | _step_clipping: 0.12 | _step_step: 349.62 | _step_zero_grad: 0.47 | _step_check_overflow: 0.50 samples/sec: 16.356 | iteration 10980/ 143000 | elapsed time per iteration (ms): 62605.4 | learning rate: 5.934E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.495412E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 22:37:24,351] [INFO] [logging.py:60:log_dist] [Rank 0] step=10990, skipped=6, lr=[0.0005934159305501256, 0.0005934159305501256], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 10990 loss: 2.4869 iter time (s): 62.897 samples/sec: 16.281 %comms: 0.0028646737643310475 %optimizer_step 0.05697593632265497 %forward: 23.104439631166482 %backward: 62.06462895816933 [2025-04-01 22:37:24,352] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22833.34 | forward: 145320.55 | backward_microstep: 390376.22 | backward: 390369.39 | backward_inner_microstep: 390353.79 | backward_inner: 390347.93 | backward_allreduce_microstep: 7.57 | backward_allreduce: 2.60 | reduce_tied_grads: 0.31 | comms: 18.02 | reduce_grads: 0.20 | step: 358.36 | _step_clipping: 0.14 | _step_step: 356.61 | _step_zero_grad: 0.52 | _step_check_overflow: 0.51 samples/sec: 16.280 | iteration 10990/ 143000 | elapsed time per iteration (ms): 62897.8 | learning rate: 5.934E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.490598E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 22:47:51,975] [INFO] [logging.py:60:log_dist] [Rank 0] step=11000, skipped=6, lr=[0.0005934021912530935, 0.0005934021912530935], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11000 loss: 2.4906 iter time (s): 62.762 samples/sec: 16.316 %comms: 0.002855014440545354 %optimizer_step 0.055779679385031325 %forward: 23.169690199680602 %backward: 62.200936520332306 [2025-04-01 22:47:51,976] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21332.01 | forward: 145417.16 | backward_microstep: 390392.71 | backward: 390384.32 | backward_inner_microstep: 390368.49 | backward_inner: 390362.51 | backward_allreduce_microstep: 7.64 | backward_allreduce: 2.72 | reduce_tied_grads: 0.28 | comms: 17.92 | reduce_grads: 0.19 | step: 350.08 | _step_clipping: 0.11 | _step_step: 348.40 | _step_zero_grad: 0.48 | _step_check_overflow: 0.53 samples/sec: 16.316 | iteration 11000/ 143000 | elapsed time per iteration (ms): 62762.4 | learning rate: 5.934E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.487947E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 22:47:54,965] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: ../checkpoints/mydata_left-pythia160m/global_step11000/mp_rank_00_model_states.pt [2025-04-01 22:48:08,972] [INFO] [engine.py:1805:_copy_recovery_script] creating recovery script ../checkpoints/mydata_left-pythia160m/zero_to_fp32.py [2025-04-01 22:48:08,977] [INFO] [engine.py:1818:_save_zero_checkpoint] zero checkpoint saved ../checkpoints/mydata_left-pythia160m/global_step11000/zero_pp_rank_0_mp_rank_00_optim_states.pt [2025-04-01 22:58:37,645] [INFO] [logging.py:60:log_dist] [Rank 0] step=11010, skipped=6, lr=[0.000593388437795156, 0.000593388437795156], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11010 loss: 2.4993 iter time (s): 62.865 samples/sec: 16.289 %comms: 0.002895743677650228 %optimizer_step 0.05679046607526962 %forward: 23.166014918089108 %backward: 62.09899220250531 [2025-04-01 22:58:37,646] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22143.88 | forward: 145634.11 | backward_microstep: 390400.53 | backward: 390387.88 | backward_inner_microstep: 390369.91 | backward_inner: 390363.77 | backward_allreduce_microstep: 7.71 | backward_allreduce: 2.63 | reduce_tied_grads: 0.30 | comms: 18.20 | reduce_grads: 0.19 | step: 357.02 | _step_clipping: 0.13 | _step_step: 355.22 | _step_zero_grad: 0.54 | _step_check_overflow: 0.52 samples/sec: 15.859 | iteration 11010/ 143000 | elapsed time per iteration (ms): 64567.0 | learning rate: 5.934E-04 | approx flops per GPU: 68.4TFLOPS | lm_loss: 2.484105E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 23:09:02,534] [INFO] [logging.py:60:log_dist] [Rank 0] step=11020, skipped=6, lr=[0.000593374670176977, 0.000593374670176977], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11020 loss: 2.4921 iter time (s): 62.488 samples/sec: 16.387 %comms: 0.0028545368571581023 %optimizer_step 0.05550455690414052 %forward: 23.279621188460496 %backward: 62.468930065502505 [2025-04-01 23:09:02,534] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18620.25 | forward: 145470.47 | backward_microstep: 390364.54 | backward: 390357.92 | backward_inner_microstep: 390342.74 | backward_inner: 390337.13 | backward_allreduce_microstep: 7.35 | backward_allreduce: 2.54 | reduce_tied_grads: 0.26 | comms: 17.84 | reduce_grads: 0.18 | step: 346.84 | _step_clipping: 0.12 | _step_step: 345.25 | _step_zero_grad: 0.47 | _step_check_overflow: 0.46 samples/sec: 16.387 | iteration 11020/ 143000 | elapsed time per iteration (ms): 62488.8 | learning rate: 5.934E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.492144E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 23:19:28,742] [INFO] [logging.py:60:log_dist] [Rank 0] step=11030, skipped=6, lr=[0.0005933608883992207, 0.0005933608883992207], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11030 loss: 2.4863 iter time (s): 62.620 samples/sec: 16.353 %comms: 0.0028647415850947726 %optimizer_step 0.055425146353031134 %forward: 23.21334947648201 %backward: 62.33733349113106 [2025-04-01 23:19:28,742] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20026.74 | forward: 145362.63 | backward_microstep: 390365.09 | backward: 390358.10 | backward_inner_microstep: 390343.09 | backward_inner: 390337.41 | backward_allreduce_microstep: 7.22 | backward_allreduce: 2.48 | reduce_tied_grads: 0.29 | comms: 17.94 | reduce_grads: 0.18 | step: 347.07 | _step_clipping: 0.14 | _step_step: 345.38 | _step_zero_grad: 0.47 | _step_check_overflow: 0.54 samples/sec: 16.352 | iteration 11030/ 143000 | elapsed time per iteration (ms): 62620.8 | learning rate: 5.934E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.491287E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 23:22:34,341] [INFO] [stage1.py:697:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 1048576.0, reducing to 1048576.0 [2025-04-01 23:23:40,512] [INFO] [stage1.py:697:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 1048576.0, reducing to 524288.0 [2025-04-01 23:26:47,382] [INFO] [stage1.py:697:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 524288.0, reducing to 262144.0 [2025-04-01 23:29:58,174] [INFO] [logging.py:60:log_dist] [Rank 0] step=11040, skipped=9, lr=[0.0005933512327301993, 0.0005933512327301993], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11040 loss: 2.5139 iter time (s): 62.943 samples/sec: 16.269 %comms: 0.002046206756895089 %optimizer_step 0.04232189662505813 %forward: 23.11002373315369 %backward: 62.019762447433365 [2025-04-01 23:29:58,175] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23236.78 | forward: 145460.66 | backward_microstep: 390375.71 | backward: 390368.94 | backward_inner_microstep: 390353.87 | backward_inner: 390348.08 | backward_allreduce_microstep: 7.22 | backward_allreduce: 2.48 | reduce_tied_grads: 0.30 | comms: 12.88 | reduce_grads: 0.19 | step: 266.39 | _step_clipping: 0.13 | _step_step: 264.70 | _step_zero_grad: 0.47 | _step_check_overflow: 0.52 samples/sec: 16.269 | iteration 11040/ 143000 | elapsed time per iteration (ms): 62943.2 | learning rate: 5.934E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.512380E+00 | loss scale: 262144.0 | number of skipped iterations: 3 | number of nan iterations: 0 | time (ms) [2025-04-01 23:40:27,724] [INFO] [logging.py:60:log_dist] [Rank 0] step=11050, skipped=9, lr=[0.0005933374268826888, 0.0005933374268826888], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11050 loss: 2.5043 iter time (s): 62.954 samples/sec: 16.266 %comms: 0.0028864238315827926 %optimizer_step 0.05694154211926398 %forward: 23.114545364673333 %backward: 62.00668301417359 [2025-04-01 23:40:27,724] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23242.91 | forward: 145516.25 | backward_microstep: 390367.89 | backward: 390359.40 | backward_inner_microstep: 390343.93 | backward_inner: 390337.88 | backward_allreduce_microstep: 7.48 | backward_allreduce: 2.58 | reduce_tied_grads: 0.27 | comms: 18.17 | reduce_grads: 0.19 | step: 358.47 | _step_clipping: 0.13 | _step_step: 356.82 | _step_zero_grad: 0.47 | _step_check_overflow: 0.50 samples/sec: 16.266 | iteration 11050/ 143000 | elapsed time per iteration (ms): 62954.9 | learning rate: 5.933E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.502162E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-01 23:50:57,388] [INFO] [logging.py:60:log_dist] [Rank 0] step=11060, skipped=9, lr=[0.0005933236068773989, 0.0005933236068773989], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11060 loss: 2.5368 iter time (s): 62.966 samples/sec: 16.263 %comms: 0.002867798838949046 %optimizer_step 0.05586198603471024 %forward: 23.141967367489748 %backward: 62.00964143761324 [2025-04-01 23:50:57,389] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23069.97 | forward: 145715.42 | backward_microstep: 390457.39 | backward: 390449.13 | backward_inner_microstep: 390433.53 | backward_inner: 390427.63 | backward_allreduce_microstep: 7.46 | backward_allreduce: 2.56 | reduce_tied_grads: 0.30 | comms: 18.06 | reduce_grads: 0.19 | step: 351.74 | _step_clipping: 0.12 | _step_step: 350.07 | _step_zero_grad: 0.51 | _step_check_overflow: 0.49 samples/sec: 16.263 | iteration 11060/ 143000 | elapsed time per iteration (ms): 62966.4 | learning rate: 5.933E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.506390E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 00:01:22,135] [INFO] [logging.py:60:log_dist] [Rank 0] step=11070, skipped=9, lr=[0.0005933097727149964, 0.0005933097727149964], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11070 loss: 2.5038 iter time (s): 62.474 samples/sec: 16.391 %comms: 0.002879536369777581 %optimizer_step 0.05609470723450449 %forward: 23.304729281054517 %backward: 62.500058889980124 [2025-04-02 00:01:22,135] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18275.76 | forward: 145594.13 | backward_microstep: 390474.41 | backward: 390463.30 | backward_inner_microstep: 390443.92 | backward_inner: 390437.84 | backward_allreduce_microstep: 9.25 | backward_allreduce: 2.60 | reduce_tied_grads: 0.31 | comms: 17.99 | reduce_grads: 0.19 | step: 350.45 | _step_clipping: 0.14 | _step_step: 348.74 | _step_zero_grad: 0.47 | _step_check_overflow: 0.51 samples/sec: 16.391 | iteration 11070/ 143000 | elapsed time per iteration (ms): 62474.6 | learning rate: 5.933E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.503776E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 00:11:55,077] [INFO] [logging.py:60:log_dist] [Rank 0] step=11080, skipped=9, lr=[0.0005932959243961489, 0.0005932959243961489], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11080 loss: 2.5123 iter time (s): 63.294 samples/sec: 16.179 %comms: 0.0028176492045047332 %optimizer_step 0.05461303362946877 %forward: 22.982762973264308 %backward: 61.66605795773064 [2025-04-02 00:11:55,077] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26742.22 | forward: 145466.44 | backward_microstep: 390314.74 | backward: 390307.38 | backward_inner_microstep: 390291.65 | backward_inner: 390285.66 | backward_allreduce_microstep: 7.56 | backward_allreduce: 2.60 | reduce_tied_grads: 0.26 | comms: 17.83 | reduce_grads: 0.18 | step: 345.67 | _step_clipping: 0.13 | _step_step: 344.02 | _step_zero_grad: 0.47 | _step_check_overflow: 0.52 samples/sec: 16.178 | iteration 11080/ 143000 | elapsed time per iteration (ms): 63294.2 | learning rate: 5.933E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.497457E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 00:22:18,631] [INFO] [logging.py:60:log_dist] [Rank 0] step=11090, skipped=9, lr=[0.0005932820619215252, 0.0005932820619215252], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11090 loss: 2.4886 iter time (s): 62.355 samples/sec: 16.422 %comms: 0.002870815456736387 %optimizer_step 0.05525843719316551 %forward: 23.30088276825874 %backward: 62.59966042530196 [2025-04-02 00:22:18,632] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17518.66 | forward: 145292.44 | backward_microstep: 390348.62 | backward: 390339.62 | backward_inner_microstep: 390320.71 | backward_inner: 390314.72 | backward_allreduce_microstep: 10.96 | backward_allreduce: 2.55 | reduce_tied_grads: 0.27 | comms: 17.90 | reduce_grads: 0.20 | step: 344.56 | _step_clipping: 0.11 | _step_step: 342.82 | _step_zero_grad: 0.48 | _step_check_overflow: 0.61 samples/sec: 16.422 | iteration 11090/ 143000 | elapsed time per iteration (ms): 62355.4 | learning rate: 5.933E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.492345E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 00:32:47,302] [INFO] [logging.py:60:log_dist] [Rank 0] step=11100, skipped=9, lr=[0.0005932681852917938, 0.0005932681852917938], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11100 loss: 2.4574 iter time (s): 62.866 samples/sec: 16.288 %comms: 0.0028477582532929297 %optimizer_step 0.0551790043249514 %forward: 23.129799912132416 %backward: 62.08174517258016 [2025-04-02 00:32:47,302] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22568.40 | forward: 145408.89 | backward_microstep: 390292.63 | backward: 390286.02 | backward_inner_microstep: 390268.72 | backward_inner: 390262.86 | backward_allreduce_microstep: 9.29 | backward_allreduce: 2.76 | reduce_tied_grads: 0.27 | comms: 17.90 | reduce_grads: 0.19 | step: 346.89 | _step_clipping: 0.13 | _step_step: 345.22 | _step_zero_grad: 0.45 | _step_check_overflow: 0.55 samples/sec: 16.288 | iteration 11100/ 143000 | elapsed time per iteration (ms): 62867.0 | learning rate: 5.933E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.482221E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 00:43:07,274] [INFO] [logging.py:60:log_dist] [Rank 0] step=11110, skipped=9, lr=[0.0005932542945076248, 0.0005932542945076248], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11110 loss: 2.4917 iter time (s): 61.997 samples/sec: 16.517 %comms: 0.0031494079839820945 %optimizer_step 0.05682413429698641 %forward: 23.433469239930847 %backward: 62.958289162977685 [2025-04-02 00:43:07,275] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13967.61 | forward: 145279.77 | backward_microstep: 390326.59 | backward: 390320.60 | backward_inner_microstep: 390305.27 | backward_inner: 390299.81 | backward_allreduce_microstep: 7.43 | backward_allreduce: 2.55 | reduce_tied_grads: 0.24 | comms: 19.53 | reduce_grads: 4.11 | step: 352.29 | _step_clipping: 0.11 | _step_step: 350.64 | _step_zero_grad: 0.46 | _step_check_overflow: 0.52 samples/sec: 16.517 | iteration 11110/ 143000 | elapsed time per iteration (ms): 61997.3 | learning rate: 5.933E-04 | approx flops per GPU: 71.3TFLOPS | lm_loss: 2.485122E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 00:53:28,397] [INFO] [logging.py:60:log_dist] [Rank 0] step=11120, skipped=9, lr=[0.0005932403895696886, 0.0005932403895696886], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11120 loss: 2.5083 iter time (s): 62.112 samples/sec: 16.486 %comms: 0.0028921897512439827 %optimizer_step 0.05746351041965079 %forward: 23.40119335220716 %backward: 62.844993564342325 [2025-04-02 00:53:28,398] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15012.98 | forward: 145348.82 | backward_microstep: 390348.03 | backward: 390341.03 | backward_inner_microstep: 390325.61 | backward_inner: 390319.82 | backward_allreduce_microstep: 7.45 | backward_allreduce: 2.57 | reduce_tied_grads: 0.27 | comms: 17.96 | reduce_grads: 0.19 | step: 356.92 | _step_clipping: 0.11 | _step_step: 355.11 | _step_zero_grad: 0.50 | _step_check_overflow: 0.63 samples/sec: 16.486 | iteration 11120/ 143000 | elapsed time per iteration (ms): 62112.3 | learning rate: 5.932E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.489918E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 01:03:57,251] [INFO] [logging.py:60:log_dist] [Rank 0] step=11130, skipped=9, lr=[0.0005932264704786562, 0.0005932264704786562], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11130 loss: 2.4872 iter time (s): 62.885 samples/sec: 16.284 %comms: 0.0028507187227190785 %optimizer_step 0.06071109580482713 %forward: 23.14379539344934 %backward: 62.08317777725727 [2025-04-02 01:03:57,252] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22431.14 | forward: 145539.34 | backward_microstep: 390416.77 | backward: 390408.94 | backward_inner_microstep: 390391.67 | backward_inner: 390385.69 | backward_allreduce_microstep: 9.13 | backward_allreduce: 2.58 | reduce_tied_grads: 0.28 | comms: 17.93 | reduce_grads: 0.18 | step: 381.78 | _step_clipping: 0.12 | _step_step: 379.99 | _step_zero_grad: 0.51 | _step_check_overflow: 0.56 samples/sec: 16.284 | iteration 11130/ 143000 | elapsed time per iteration (ms): 62885.4 | learning rate: 5.932E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.482284E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 01:14:26,751] [INFO] [logging.py:60:log_dist] [Rank 0] step=11140, skipped=9, lr=[0.0005932125372351995, 0.0005932125372351995], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11140 loss: 2.4763 iter time (s): 62.949 samples/sec: 16.267 %comms: 0.0028617310890493666 %optimizer_step 0.05559802126100384 %forward: 23.09571831701603 %backward: 62.0064012301485 [2025-04-02 01:14:26,751] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23374.64 | forward: 145386.21 | backward_microstep: 390333.55 | backward: 390326.69 | backward_inner_microstep: 390311.25 | backward_inner: 390305.37 | backward_allreduce_microstep: 7.45 | backward_allreduce: 2.55 | reduce_tied_grads: 0.29 | comms: 18.01 | reduce_grads: 0.18 | step: 349.99 | _step_clipping: 0.13 | _step_step: 348.28 | _step_zero_grad: 0.47 | _step_check_overflow: 0.55 samples/sec: 16.267 | iteration 11140/ 143000 | elapsed time per iteration (ms): 62950.0 | learning rate: 5.932E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.485580E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 01:24:49,567] [INFO] [logging.py:60:log_dist] [Rank 0] step=11150, skipped=9, lr=[0.000593198589839991, 0.000593198589839991], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11150 loss: 2.4944 iter time (s): 62.281 samples/sec: 16.442 %comms: 0.0028852814057741785 %optimizer_step 0.05596315104833535 %forward: 23.421286071064927 %backward: 62.70064902114293 [2025-04-02 01:24:49,568] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15989.96 | forward: 145870.32 | backward_microstep: 390514.31 | backward: 390506.47 | backward_inner_microstep: 390490.42 | backward_inner: 390484.21 | backward_allreduce_microstep: 7.74 | backward_allreduce: 2.67 | reduce_tied_grads: 0.29 | comms: 17.97 | reduce_grads: 0.19 | step: 348.54 | _step_clipping: 0.13 | _step_step: 346.80 | _step_zero_grad: 0.51 | _step_check_overflow: 0.52 samples/sec: 16.441 | iteration 11150/ 143000 | elapsed time per iteration (ms): 62281.7 | learning rate: 5.932E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.485248E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 01:35:14,959] [INFO] [logging.py:60:log_dist] [Rank 0] step=11160, skipped=9, lr=[0.0005931846282937038, 0.0005931846282937038], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11160 loss: 2.4907 iter time (s): 62.539 samples/sec: 16.374 %comms: 0.002854262567170088 %optimizer_step 0.05513558136052974 %forward: 23.262770600486345 %backward: 62.418932799502734 [2025-04-02 01:35:14,959] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19156.31 | forward: 145482.13 | backward_microstep: 390366.47 | backward: 390359.31 | backward_inner_microstep: 390343.84 | backward_inner: 390337.99 | backward_allreduce_microstep: 7.45 | backward_allreduce: 2.58 | reduce_tied_grads: 0.28 | comms: 17.85 | reduce_grads: 0.18 | step: 344.81 | _step_clipping: 0.11 | _step_step: 343.22 | _step_zero_grad: 0.47 | _step_check_overflow: 0.47 samples/sec: 16.374 | iteration 11160/ 143000 | elapsed time per iteration (ms): 62539.1 | learning rate: 5.932E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.499907E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 01:45:38,863] [INFO] [logging.py:60:log_dist] [Rank 0] step=11170, skipped=9, lr=[0.0005931706525970118, 0.0005931706525970118], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11170 loss: 2.4859 iter time (s): 62.390 samples/sec: 16.413 %comms: 0.0028713450986512135 %optimizer_step 0.05636511800797299 %forward: 23.27873282371932 %backward: 62.539661366541225 [2025-04-02 01:45:38,864] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18091.84 | forward: 145235.80 | backward_microstep: 390193.64 | backward: 390184.36 | backward_inner_microstep: 390168.82 | backward_inner: 390162.68 | backward_allreduce_microstep: 7.39 | backward_allreduce: 2.56 | reduce_tied_grads: 0.29 | comms: 17.91 | reduce_grads: 0.18 | step: 351.66 | _step_clipping: 0.12 | _step_step: 349.77 | _step_zero_grad: 0.47 | _step_check_overflow: 0.76 samples/sec: 16.413 | iteration 11170/ 143000 | elapsed time per iteration (ms): 62390.5 | learning rate: 5.932E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.490604E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 01:56:02,603] [INFO] [logging.py:60:log_dist] [Rank 0] step=11180, skipped=9, lr=[0.0005931566627505894, 0.0005931566627505894], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11180 loss: 2.4921 iter time (s): 62.373 samples/sec: 16.417 %comms: 0.002893166385109326 %optimizer_step 0.05623940241971503 %forward: 23.288061934910964 %backward: 62.54339627259593 [2025-04-02 01:56:02,603] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18004.69 | forward: 145255.57 | backward_microstep: 390111.41 | backward: 390104.46 | backward_inner_microstep: 390089.25 | backward_inner: 390083.47 | backward_allreduce_microstep: 7.34 | backward_allreduce: 2.52 | reduce_tied_grads: 0.28 | comms: 18.05 | reduce_grads: 0.18 | step: 350.78 | _step_clipping: 0.10 | _step_step: 349.13 | _step_zero_grad: 0.46 | _step_check_overflow: 0.56 samples/sec: 16.417 | iteration 11180/ 143000 | elapsed time per iteration (ms): 62373.9 | learning rate: 5.932E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.492435E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 02:06:29,934] [INFO] [logging.py:60:log_dist] [Rank 0] step=11190, skipped=9, lr=[0.0005931426587551118, 0.0005931426587551118], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11190 loss: 2.4688 iter time (s): 62.733 samples/sec: 16.323 %comms: 0.002871621018412402 %optimizer_step 0.056045294929114975 %forward: 23.191125186186568 %backward: 62.22846346831281 [2025-04-02 02:06:29,935] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21037.83 | forward: 145484.00 | backward_microstep: 390384.96 | backward: 390375.45 | backward_inner_microstep: 390359.65 | backward_inner: 390353.70 | backward_allreduce_microstep: 7.57 | backward_allreduce: 2.62 | reduce_tied_grads: 0.30 | comms: 18.01 | reduce_grads: 0.18 | step: 351.59 | _step_clipping: 0.14 | _step_step: 349.89 | _step_zero_grad: 0.48 | _step_check_overflow: 0.52 samples/sec: 16.323 | iteration 11190/ 143000 | elapsed time per iteration (ms): 62733.2 | learning rate: 5.931E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.486430E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 02:16:59,204] [INFO] [logging.py:60:log_dist] [Rank 0] step=11200, skipped=9, lr=[0.0005931286406112552, 0.0005931286406112552], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11200 loss: 2.4955 iter time (s): 62.926 samples/sec: 16.273 %comms: 0.0028385268955189715 %optimizer_step 0.05593771189139331 %forward: 23.11935694304152 %backward: 62.01672883275982 [2025-04-02 02:16:59,205] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23110.51 | forward: 145481.91 | backward_microstep: 390256.42 | backward: 390249.28 | backward_inner_microstep: 390233.53 | backward_inner: 390227.59 | backward_allreduce_microstep: 7.50 | backward_allreduce: 2.58 | reduce_tied_grads: 0.26 | comms: 17.86 | reduce_grads: 0.18 | step: 352.00 | _step_clipping: 0.12 | _step_step: 350.30 | _step_zero_grad: 0.47 | _step_check_overflow: 0.57 samples/sec: 16.273 | iteration 11200/ 143000 | elapsed time per iteration (ms): 62927.0 | learning rate: 5.931E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.485817E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 02:27:23,699] [INFO] [logging.py:60:log_dist] [Rank 0] step=11210, skipped=9, lr=[0.0005931146083196958, 0.0005931146083196958], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11210 loss: 2.4912 iter time (s): 62.449 samples/sec: 16.397 %comms: 0.0028658424718278866 %optimizer_step 0.056349873374125746 %forward: 23.259093853038927 %backward: 62.489836968194744 [2025-04-02 02:27:23,700] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18609.39 | forward: 145250.64 | backward_microstep: 390250.39 | backward: 390242.59 | backward_inner_microstep: 390226.90 | backward_inner: 390220.96 | backward_allreduce_microstep: 7.52 | backward_allreduce: 2.60 | reduce_tied_grads: 0.29 | comms: 17.90 | reduce_grads: 0.18 | step: 351.90 | _step_clipping: 0.12 | _step_step: 350.20 | _step_zero_grad: 0.46 | _step_check_overflow: 0.56 samples/sec: 16.397 | iteration 11210/ 143000 | elapsed time per iteration (ms): 62449.5 | learning rate: 5.931E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.490757E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 02:37:54,062] [INFO] [logging.py:60:log_dist] [Rank 0] step=11220, skipped=9, lr=[0.0005931005618811111, 0.0005931005618811111], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11220 loss: 2.4868 iter time (s): 63.036 samples/sec: 16.245 %comms: 0.0028586096935579145 %optimizer_step 0.05637458658318479 %forward: 23.069886045342212 %backward: 61.91034637871504 [2025-04-02 02:37:54,062] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24279.53 | forward: 145422.57 | backward_microstep: 390265.93 | backward: 390256.02 | backward_inner_microstep: 390237.72 | backward_inner: 390229.57 | backward_allreduce_microstep: 9.55 | backward_allreduce: 2.66 | reduce_tied_grads: 0.36 | comms: 18.02 | reduce_grads: 0.20 | step: 355.36 | _step_clipping: 0.14 | _step_step: 353.58 | _step_zero_grad: 0.51 | _step_check_overflow: 0.56 samples/sec: 16.245 | iteration 11220/ 143000 | elapsed time per iteration (ms): 63036.2 | learning rate: 5.931E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.497251E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 02:48:22,697] [INFO] [logging.py:60:log_dist] [Rank 0] step=11230, skipped=9, lr=[0.0005930865012961791, 0.0005930865012961791], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11230 loss: 2.4763 iter time (s): 62.863 samples/sec: 16.289 %comms: 0.00285049566267358 %optimizer_step 0.055657522631852946 %forward: 23.120230256605744 %backward: 62.07943274773734 [2025-04-02 02:48:22,697] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22628.38 | forward: 145340.64 | backward_microstep: 390258.29 | backward: 390249.78 | backward_inner_microstep: 390228.84 | backward_inner: 390222.84 | backward_allreduce_microstep: 10.98 | backward_allreduce: 4.30 | reduce_tied_grads: 0.30 | comms: 17.92 | reduce_grads: 0.18 | step: 349.88 | _step_clipping: 0.11 | _step_step: 348.25 | _step_zero_grad: 0.46 | _step_check_overflow: 0.51 samples/sec: 16.289 | iteration 11230/ 143000 | elapsed time per iteration (ms): 62863.5 | learning rate: 5.931E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.482398E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 02:58:54,278] [INFO] [logging.py:60:log_dist] [Rank 0] step=11240, skipped=9, lr=[0.0005930724265655782, 0.0005930724265655782], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11240 loss: 2.4932 iter time (s): 63.158 samples/sec: 16.213 %comms: 0.0031652470932846404 %optimizer_step 0.05981693379244939 %forward: 23.06020800156572 %backward: 61.7933649567067 [2025-04-02 02:58:54,278] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25247.05 | forward: 145642.59 | backward_microstep: 390280.62 | backward: 390271.67 | backward_inner_microstep: 390255.69 | backward_inner: 390249.33 | backward_allreduce_microstep: 7.63 | backward_allreduce: 2.64 | reduce_tied_grads: 2.16 | comms: 19.99 | reduce_grads: 0.19 | step: 377.79 | _step_clipping: 0.14 | _step_step: 376.09 | _step_zero_grad: 0.47 | _step_check_overflow: 0.52 samples/sec: 16.213 | iteration 11240/ 143000 | elapsed time per iteration (ms): 63158.1 | learning rate: 5.931E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.485546E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 03:09:30,049] [INFO] [logging.py:60:log_dist] [Rank 0] step=11250, skipped=9, lr=[0.0005930583376899878, 0.0005930583376899878], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11250 loss: 2.5141 iter time (s): 63.577 samples/sec: 16.107 %comms: 0.0028506767583914004 %optimizer_step 0.05574090090040048 %forward: 22.89644071814894 %backward: 61.39510211004258 [2025-04-02 03:09:30,049] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29441.48 | forward: 145567.73 | backward_microstep: 390339.40 | backward: 390329.04 | backward_inner_microstep: 390312.64 | backward_inner: 390306.38 | backward_allreduce_microstep: 7.76 | backward_allreduce: 2.67 | reduce_tied_grads: 0.31 | comms: 18.12 | reduce_grads: 0.19 | step: 354.38 | _step_clipping: 0.13 | _step_step: 352.63 | _step_zero_grad: 0.49 | _step_check_overflow: 0.56 samples/sec: 16.106 | iteration 11250/ 143000 | elapsed time per iteration (ms): 63577.1 | learning rate: 5.931E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.489503E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 03:19:59,809] [INFO] [logging.py:60:log_dist] [Rank 0] step=11260, skipped=9, lr=[0.000593044234670088, 0.000593044234670088], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11260 loss: 2.4914 iter time (s): 62.976 samples/sec: 16.260 %comms: 0.002852329954691253 %optimizer_step 0.05589458958782833 %forward: 23.10408365869787 %backward: 61.9758446568036 [2025-04-02 03:19:59,810] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23548.51 | forward: 145499.15 | backward_microstep: 390305.46 | backward: 390296.06 | backward_inner_microstep: 390279.86 | backward_inner: 390271.97 | backward_allreduce_microstep: 7.76 | backward_allreduce: 2.67 | reduce_tied_grads: 0.32 | comms: 17.96 | reduce_grads: 0.19 | step: 352.00 | _step_clipping: 0.14 | _step_step: 350.27 | _step_zero_grad: 0.48 | _step_check_overflow: 0.56 samples/sec: 16.260 | iteration 11260/ 143000 | elapsed time per iteration (ms): 62976.1 | learning rate: 5.930E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.489061E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 03:30:27,658] [INFO] [logging.py:60:log_dist] [Rank 0] step=11270, skipped=9, lr=[0.0005930301175065593, 0.0005930301175065593], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11270 loss: 2.4812 iter time (s): 62.784 samples/sec: 16.310 %comms: 0.0028630695991857763 %optimizer_step 0.05631313664899481 %forward: 23.15726169467657 %backward: 62.14688370781548 [2025-04-02 03:30:27,658] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21861.77 | forward: 145391.14 | backward_microstep: 390192.69 | backward: 390184.57 | backward_inner_microstep: 390168.82 | backward_inner: 390162.90 | backward_allreduce_microstep: 7.63 | backward_allreduce: 2.65 | reduce_tied_grads: 0.31 | comms: 17.98 | reduce_grads: 0.19 | step: 353.56 | _step_clipping: 0.14 | _step_step: 351.85 | _step_zero_grad: 0.48 | _step_check_overflow: 0.53 samples/sec: 16.310 | iteration 11270/ 143000 | elapsed time per iteration (ms): 62784.8 | learning rate: 5.930E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.479999E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 03:40:52,936] [INFO] [logging.py:60:log_dist] [Rank 0] step=11280, skipped=9, lr=[0.0005930159862000831, 0.0005930159862000831], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11280 loss: 2.4774 iter time (s): 62.527 samples/sec: 16.377 %comms: 0.002881659825321891 %optimizer_step 0.05647922089374639 %forward: 23.2684486878887 %backward: 62.43705166747203 [2025-04-02 03:40:52,937] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18939.16 | forward: 145491.37 | backward_microstep: 390411.95 | backward: 390402.13 | backward_inner_microstep: 390385.76 | backward_inner: 390379.43 | backward_allreduce_microstep: 7.74 | backward_allreduce: 2.67 | reduce_tied_grads: 0.29 | comms: 18.02 | reduce_grads: 0.18 | step: 353.15 | _step_clipping: 0.12 | _step_step: 351.52 | _step_zero_grad: 0.47 | _step_check_overflow: 0.48 samples/sec: 16.377 | iteration 11280/ 143000 | elapsed time per iteration (ms): 62527.8 | learning rate: 5.930E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.487316E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 03:51:16,896] [INFO] [logging.py:60:log_dist] [Rank 0] step=11290, skipped=9, lr=[0.0005930018407513415, 0.0005930018407513415], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11290 loss: 2.5057 iter time (s): 62.395 samples/sec: 16.411 %comms: 0.0028720822263553617 %optimizer_step 0.05604664189393469 %forward: 23.30486779283246 %backward: 62.539941254722685 [2025-04-02 03:51:16,897] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17949.74 | forward: 145411.83 | backward_microstep: 390228.30 | backward: 390220.95 | backward_inner_microstep: 390205.07 | backward_inner: 390199.05 | backward_allreduce_microstep: 7.65 | backward_allreduce: 2.64 | reduce_tied_grads: 0.29 | comms: 17.92 | reduce_grads: 0.19 | step: 349.71 | _step_clipping: 0.14 | _step_step: 348.01 | _step_zero_grad: 0.46 | _step_check_overflow: 0.55 samples/sec: 16.411 | iteration 11290/ 143000 | elapsed time per iteration (ms): 62396.1 | learning rate: 5.930E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.507388E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 04:01:45,709] [INFO] [logging.py:60:log_dist] [Rank 0] step=11300, skipped=9, lr=[0.0005929876811610172, 0.0005929876811610172], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11300 loss: 2.4955 iter time (s): 62.881 samples/sec: 16.285 %comms: 0.0028652417268663715 %optimizer_step 0.056933067451630764 %forward: 23.122122409245467 %backward: 62.07740869952718 [2025-04-02 04:01:45,710] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22611.82 | forward: 145393.32 | backward_microstep: 390354.23 | backward: 390346.54 | backward_inner_microstep: 390330.62 | backward_inner: 390324.48 | backward_allreduce_microstep: 7.63 | backward_allreduce: 2.62 | reduce_tied_grads: 0.32 | comms: 18.02 | reduce_grads: 0.20 | step: 358.00 | _step_clipping: 0.13 | _step_step: 356.18 | _step_zero_grad: 0.53 | _step_check_overflow: 0.58 samples/sec: 16.285 | iteration 11300/ 143000 | elapsed time per iteration (ms): 62881.2 | learning rate: 5.930E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.496359E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 04:12:15,871] [INFO] [logging.py:60:log_dist] [Rank 0] step=11310, skipped=9, lr=[0.0005929735074297938, 0.0005929735074297938], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11310 loss: 2.5022 iter time (s): 63.016 samples/sec: 16.250 %comms: 0.0028770384927890814 %optimizer_step 0.05801279758040798 %forward: 23.119159325402446 %backward: 61.93460137674271 [2025-04-02 04:12:15,871] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23767.38 | forward: 145686.73 | backward_microstep: 390292.87 | backward: 390284.50 | backward_inner_microstep: 390268.25 | backward_inner: 390262.04 | backward_allreduce_microstep: 7.73 | backward_allreduce: 2.66 | reduce_tied_grads: 0.33 | comms: 18.13 | reduce_grads: 0.21 | step: 365.57 | _step_clipping: 0.13 | _step_step: 363.90 | _step_zero_grad: 0.48 | _step_check_overflow: 0.47 samples/sec: 16.250 | iteration 11310/ 143000 | elapsed time per iteration (ms): 63016.2 | learning rate: 5.930E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.495063E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 04:22:41,077] [INFO] [logging.py:60:log_dist] [Rank 0] step=11320, skipped=9, lr=[0.000592959319558355, 0.000592959319558355], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11320 loss: 2.4830 iter time (s): 62.519 samples/sec: 16.379 %comms: 0.002865475966115691 %optimizer_step 0.05603298295699899 %forward: 23.280125012822424 %backward: 62.4185752208936 [2025-04-02 04:22:41,078] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19014.16 | forward: 145545.85 | backward_microstep: 390245.02 | backward: 390236.93 | backward_inner_microstep: 390219.45 | backward_inner: 390211.70 | backward_allreduce_microstep: 9.22 | backward_allreduce: 2.55 | reduce_tied_grads: 0.31 | comms: 17.91 | reduce_grads: 0.18 | step: 350.31 | _step_clipping: 0.13 | _step_step: 348.62 | _step_zero_grad: 0.48 | _step_check_overflow: 0.54 samples/sec: 16.379 | iteration 11320/ 143000 | elapsed time per iteration (ms): 62520.6 | learning rate: 5.930E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.481474E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 04:33:09,258] [INFO] [logging.py:60:log_dist] [Rank 0] step=11330, skipped=9, lr=[0.0005929451175473859, 0.0005929451175473859], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11330 loss: 2.4738 iter time (s): 62.818 samples/sec: 16.301 %comms: 0.0028599593427135342 %optimizer_step 0.05716123270058453 %forward: 23.142330769170318 %backward: 62.10557134961784 [2025-04-02 04:33:09,258] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22298.45 | forward: 145374.38 | backward_microstep: 390138.74 | backward: 390131.79 | backward_inner_microstep: 390116.03 | backward_inner: 390110.21 | backward_allreduce_microstep: 7.54 | backward_allreduce: 2.57 | reduce_tied_grads: 0.30 | comms: 17.97 | reduce_grads: 0.19 | step: 359.07 | _step_clipping: 0.11 | _step_step: 357.38 | _step_zero_grad: 0.47 | _step_check_overflow: 0.55 samples/sec: 16.301 | iteration 11330/ 143000 | elapsed time per iteration (ms): 62818.1 | learning rate: 5.929E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.478301E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 04:43:39,844] [INFO] [logging.py:60:log_dist] [Rank 0] step=11340, skipped=9, lr=[0.0005929309013975717, 0.0005929309013975717], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11340 loss: 2.4853 iter time (s): 63.058 samples/sec: 16.239 %comms: 0.002841827866118411 %optimizer_step 0.05556975438133023 %forward: 23.06903754308728 %backward: 61.884121485221634 [2025-04-02 04:43:39,845] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24485.97 | forward: 145468.89 | backward_microstep: 390237.89 | backward: 390229.31 | backward_inner_microstep: 390209.60 | backward_inner: 390201.71 | backward_allreduce_microstep: 11.49 | backward_allreduce: 2.57 | reduce_tied_grads: 0.28 | comms: 17.92 | reduce_grads: 0.19 | step: 350.41 | _step_clipping: 0.13 | _step_step: 348.78 | _step_zero_grad: 0.49 | _step_check_overflow: 0.47 samples/sec: 16.239 | iteration 11340/ 143000 | elapsed time per iteration (ms): 63058.6 | learning rate: 5.929E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.487427E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 04:54:03,888] [INFO] [logging.py:60:log_dist] [Rank 0] step=11350, skipped=9, lr=[0.0005929166711095987, 0.0005929166711095987], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11350 loss: 2.4609 iter time (s): 62.404 samples/sec: 16.409 %comms: 0.0028799496008055743 %optimizer_step 0.05665534814253366 %forward: 23.297202490962317 %backward: 62.51918101163877 [2025-04-02 04:54:03,889] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18124.21 | forward: 145383.50 | backward_microstep: 390150.59 | backward: 390143.72 | backward_inner_microstep: 390128.25 | backward_inner: 390122.44 | backward_allreduce_microstep: 7.46 | backward_allreduce: 2.56 | reduce_tied_grads: 0.30 | comms: 17.97 | reduce_grads: 0.20 | step: 353.55 | _step_clipping: 0.14 | _step_step: 351.79 | _step_zero_grad: 0.51 | _step_check_overflow: 0.55 samples/sec: 16.409 | iteration 11350/ 143000 | elapsed time per iteration (ms): 62404.4 | learning rate: 5.929E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.480155E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 05:04:32,676] [INFO] [logging.py:60:log_dist] [Rank 0] step=11360, skipped=9, lr=[0.0005929024266841537, 0.0005929024266841537], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11360 loss: 2.4816 iter time (s): 62.878 samples/sec: 16.285 %comms: 0.002873272927389332 %optimizer_step 0.05622783198101852 %forward: 23.144504067609866 %backward: 62.073479190596956 [2025-04-02 05:04:32,677] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22510.35 | forward: 145528.65 | backward_microstep: 390315.41 | backward: 390307.33 | backward_inner_microstep: 390291.86 | backward_inner: 390285.73 | backward_allreduce_microstep: 7.42 | backward_allreduce: 2.55 | reduce_tied_grads: 0.30 | comms: 18.07 | reduce_grads: 0.20 | step: 353.55 | _step_clipping: 0.14 | _step_step: 350.19 | _step_zero_grad: 0.50 | _step_check_overflow: 0.54 samples/sec: 16.285 | iteration 11360/ 143000 | elapsed time per iteration (ms): 62878.8 | learning rate: 5.929E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.479162E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 05:14:57,239] [INFO] [logging.py:60:log_dist] [Rank 0] step=11370, skipped=9, lr=[0.0005928881681219241, 0.0005928881681219241], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11370 loss: 2.4996 iter time (s): 62.456 samples/sec: 16.396 %comms: 0.0029691002622991534 %optimizer_step 0.057040562502683155 %forward: 23.27372999501989 %backward: 62.49097010329277 [2025-04-02 05:14:57,239] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18503.70 | forward: 145357.68 | backward_microstep: 390299.00 | backward: 390291.66 | backward_inner_microstep: 390274.81 | backward_inner: 390269.09 | backward_allreduce_microstep: 7.31 | backward_allreduce: 2.49 | reduce_tied_grads: 0.28 | comms: 18.54 | reduce_grads: 0.18 | step: 356.25 | _step_clipping: 0.13 | _step_step: 354.51 | _step_zero_grad: 0.49 | _step_check_overflow: 0.55 samples/sec: 16.395 | iteration 11370/ 143000 | elapsed time per iteration (ms): 62456.2 | learning rate: 5.929E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.484417E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 05:25:26,028] [INFO] [logging.py:60:log_dist] [Rank 0] step=11380, skipped=9, lr=[0.0005928738954235982, 0.0005928738954235982], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11380 loss: 2.4782 iter time (s): 62.878 samples/sec: 16.285 %comms: 0.0028650785038903965 %optimizer_step 0.0552126233336366 %forward: 23.095587179477857 %backward: 62.039478443795225 [2025-04-02 05:25:26,028] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23100.26 | forward: 145221.28 | backward_microstep: 390100.53 | backward: 390094.12 | backward_inner_microstep: 390077.10 | backward_inner: 390071.45 | backward_allreduce_microstep: 7.29 | backward_allreduce: 2.50 | reduce_tied_grads: 0.25 | comms: 18.02 | reduce_grads: 0.18 | step: 347.17 | _step_clipping: 0.12 | _step_step: 345.51 | _step_zero_grad: 0.46 | _step_check_overflow: 0.55 samples/sec: 16.285 | iteration 11380/ 143000 | elapsed time per iteration (ms): 62878.9 | learning rate: 5.929E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.487870E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 05:35:49,676] [INFO] [logging.py:60:log_dist] [Rank 0] step=11390, skipped=9, lr=[0.0005928596085898649, 0.0005928596085898649], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11390 loss: 2.4853 iter time (s): 62.364 samples/sec: 16.420 %comms: 0.002924060585659103 %optimizer_step 0.0576089940228621 %forward: 23.281630461779436 %backward: 62.538943349078714 [2025-04-02 05:35:49,676] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18034.97 | forward: 145194.13 | backward_microstep: 390029.53 | backward: 390019.41 | backward_inner_microstep: 390003.73 | backward_inner: 389996.23 | backward_allreduce_microstep: 7.71 | backward_allreduce: 2.58 | reduce_tied_grads: 0.28 | comms: 18.24 | reduce_grads: 0.20 | step: 359.27 | _step_clipping: 0.12 | _step_step: 357.54 | _step_zero_grad: 0.51 | _step_check_overflow: 0.54 samples/sec: 16.420 | iteration 11390/ 143000 | elapsed time per iteration (ms): 62364.8 | learning rate: 5.929E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.489625E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 05:46:27,554] [INFO] [logging.py:60:log_dist] [Rank 0] step=11400, skipped=9, lr=[0.0005928453076214135, 0.0005928453076214135], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11400 loss: 2.4730 iter time (s): 63.787 samples/sec: 16.053 %comms: 0.0028313206997598456 %optimizer_step 0.058503607652015906 %forward: 22.80210681507377 %backward: 61.15569782752072 [2025-04-02 05:46:27,555] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31943.62 | forward: 145448.30 | backward_microstep: 390101.93 | backward: 390095.19 | backward_inner_microstep: 390079.65 | backward_inner: 390073.84 | backward_allreduce_microstep: 7.40 | backward_allreduce: 2.55 | reduce_tied_grads: 0.31 | comms: 18.06 | reduce_grads: 0.20 | step: 373.18 | _step_clipping: 0.14 | _step_step: 371.32 | _step_zero_grad: 0.52 | _step_check_overflow: 0.60 samples/sec: 16.053 | iteration 11400/ 143000 | elapsed time per iteration (ms): 63787.8 | learning rate: 5.928E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.487803E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 05:57:02,582] [INFO] [logging.py:60:log_dist] [Rank 0] step=11410, skipped=9, lr=[0.0005928309925189345, 0.0005928309925189345], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11410 loss: 2.4890 iter time (s): 63.502 samples/sec: 16.125 %comms: 0.0028267589969379696 %optimizer_step 0.05707398746120669 %forward: 22.930873002398283 %backward: 61.46487559847588 [2025-04-02 05:57:02,582] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28646.26 | forward: 145616.03 | backward_microstep: 390324.51 | backward: 390315.34 | backward_inner_microstep: 390297.04 | backward_inner: 390290.74 | backward_allreduce_microstep: 7.73 | backward_allreduce: 2.66 | reduce_tied_grads: 0.31 | comms: 17.95 | reduce_grads: 0.20 | step: 362.43 | _step_clipping: 0.14 | _step_step: 360.74 | _step_zero_grad: 0.50 | _step_check_overflow: 0.48 samples/sec: 16.125 | iteration 11410/ 143000 | elapsed time per iteration (ms): 63502.7 | learning rate: 5.928E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.487522E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 06:07:27,739] [INFO] [logging.py:60:log_dist] [Rank 0] step=11420, skipped=9, lr=[0.0005928166632831187, 0.0005928166632831187], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11420 loss: 2.4705 iter time (s): 62.515 samples/sec: 16.380 %comms: 0.0028744384281875653 %optimizer_step 0.056750613385595935 %forward: 23.257985109673648 %backward: 62.40544397113381 [2025-04-02 06:07:27,740] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19245.11 | forward: 145397.75 | backward_microstep: 390136.63 | backward: 390128.86 | backward_inner_microstep: 390113.11 | backward_inner: 390107.15 | backward_allreduce_microstep: 7.54 | backward_allreduce: 2.58 | reduce_tied_grads: 0.33 | comms: 17.97 | reduce_grads: 0.19 | step: 354.78 | _step_clipping: 0.13 | _step_step: 352.98 | _step_zero_grad: 0.48 | _step_check_overflow: 0.63 samples/sec: 16.380 | iteration 11420/ 143000 | elapsed time per iteration (ms): 62515.8 | learning rate: 5.928E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.488481E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 06:17:56,269] [INFO] [logging.py:60:log_dist] [Rank 0] step=11430, skipped=9, lr=[0.0005928023199146576, 0.0005928023199146576], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11430 loss: 2.4650 iter time (s): 62.852 samples/sec: 16.292 %comms: 0.0028695625546861473 %optimizer_step 0.05618395456998146 %forward: 23.12276319082046 %backward: 62.06286088586057 [2025-04-02 06:17:56,269] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22688.31 | forward: 145332.11 | backward_microstep: 390087.89 | backward: 390079.96 | backward_inner_microstep: 390063.85 | backward_inner: 390057.96 | backward_allreduce_microstep: 7.66 | backward_allreduce: 2.64 | reduce_tied_grads: 0.29 | comms: 18.04 | reduce_grads: 0.19 | step: 353.13 | _step_clipping: 0.13 | _step_step: 351.30 | _step_zero_grad: 0.52 | _step_check_overflow: 0.59 samples/sec: 16.292 | iteration 11430/ 143000 | elapsed time per iteration (ms): 62853.0 | learning rate: 5.928E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.477383E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 06:28:26,909] [INFO] [logging.py:60:log_dist] [Rank 0] step=11440, skipped=9, lr=[0.0005927879624142438, 0.0005927879624142438], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11440 loss: 2.4619 iter time (s): 63.063 samples/sec: 16.238 %comms: 0.002874067481809988 %optimizer_step 0.05667131333221453 %forward: 23.095105625985525 %backward: 61.88401119878934 [2025-04-02 06:28:26,909] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24285.69 | forward: 145645.36 | backward_microstep: 390269.28 | backward: 390261.01 | backward_inner_microstep: 390242.02 | backward_inner: 390236.00 | backward_allreduce_microstep: 7.43 | backward_allreduce: 2.56 | reduce_tied_grads: 0.34 | comms: 18.12 | reduce_grads: 0.20 | step: 357.39 | _step_clipping: 0.13 | _step_step: 355.52 | _step_zero_grad: 0.52 | _step_check_overflow: 0.61 samples/sec: 16.237 | iteration 11440/ 143000 | elapsed time per iteration (ms): 63064.0 | learning rate: 5.928E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.472355E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 06:38:56,889] [INFO] [logging.py:60:log_dist] [Rank 0] step=11450, skipped=9, lr=[0.0005927735907825698, 0.0005927735907825698], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11450 loss: 2.4816 iter time (s): 62.997 samples/sec: 16.255 %comms: 0.004320259160748803 %optimizer_step 0.059122046844433 %forward: 23.125723322381706 %backward: 61.955238241189846 [2025-04-02 06:38:56,890] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23535.10 | forward: 145685.64 | backward_microstep: 390309.04 | backward: 390300.82 | backward_inner_microstep: 390285.09 | backward_inner: 390279.08 | backward_allreduce_microstep: 7.62 | backward_allreduce: 2.76 | reduce_tied_grads: 0.29 | comms: 27.22 | reduce_grads: 0.22 | step: 372.45 | _step_clipping: 0.15 | _step_step: 370.54 | _step_zero_grad: 0.52 | _step_check_overflow: 0.65 samples/sec: 16.254 | iteration 11450/ 143000 | elapsed time per iteration (ms): 62998.0 | learning rate: 5.928E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.486588E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 06:49:25,876] [INFO] [logging.py:60:log_dist] [Rank 0] step=11460, skipped=9, lr=[0.0005927592050203296, 0.0005927592050203296], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11460 loss: 2.4982 iter time (s): 62.898 samples/sec: 16.280 %comms: 0.00284514876377096 %optimizer_step 0.055534435180731194 %forward: 23.124680712278252 %backward: 62.052703483542125 [2025-04-02 06:49:25,877] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22821.90 | forward: 145449.97 | backward_microstep: 390308.00 | backward: 390300.04 | backward_inner_microstep: 390284.31 | backward_inner: 390278.37 | backward_allreduce_microstep: 7.56 | backward_allreduce: 2.63 | reduce_tied_grads: 0.28 | comms: 17.90 | reduce_grads: 0.19 | step: 349.30 | _step_clipping: 0.12 | _step_step: 347.66 | _step_zero_grad: 0.46 | _step_check_overflow: 0.53 samples/sec: 16.280 | iteration 11460/ 143000 | elapsed time per iteration (ms): 62898.7 | learning rate: 5.928E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.487821E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 06:59:55,270] [INFO] [logging.py:60:log_dist] [Rank 0] step=11470, skipped=9, lr=[0.0005927448051282174, 0.0005927448051282174], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11470 loss: 2.4684 iter time (s): 62.939 samples/sec: 16.270 %comms: 0.002847057232145803 %optimizer_step 0.05550519107178656 %forward: 23.082531965054 %backward: 61.995427902161616 [2025-04-02 06:59:55,271] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23477.80 | forward: 145278.91 | backward_microstep: 390200.68 | backward: 390192.37 | backward_inner_microstep: 390176.38 | backward_inner: 390170.30 | backward_allreduce_microstep: 7.66 | backward_allreduce: 2.63 | reduce_tied_grads: 0.28 | comms: 17.92 | reduce_grads: 0.19 | step: 349.34 | _step_clipping: 0.13 | _step_step: 347.66 | _step_zero_grad: 0.47 | _step_check_overflow: 0.53 samples/sec: 16.270 | iteration 11470/ 143000 | elapsed time per iteration (ms): 62939.4 | learning rate: 5.927E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.476937E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 07:10:24,306] [INFO] [logging.py:60:log_dist] [Rank 0] step=11480, skipped=9, lr=[0.0005927303911069282, 0.0005927303911069282], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11480 loss: 2.4772 iter time (s): 62.903 samples/sec: 16.279 %comms: 0.002893331274733084 %optimizer_step 0.056420566298560015 %forward: 23.126075919902103 %backward: 62.03119880978352 [2025-04-02 07:10:24,307] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22950.72 | forward: 145469.95 | backward_microstep: 390203.15 | backward: 390194.83 | backward_inner_microstep: 390179.10 | backward_inner: 390171.36 | backward_allreduce_microstep: 7.50 | backward_allreduce: 2.57 | reduce_tied_grads: 0.32 | comms: 18.20 | reduce_grads: 0.19 | step: 354.90 | _step_clipping: 0.13 | _step_step: 353.07 | _step_zero_grad: 0.53 | _step_check_overflow: 0.59 samples/sec: 16.279 | iteration 11480/ 143000 | elapsed time per iteration (ms): 62903.6 | learning rate: 5.927E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.482123E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 07:20:55,112] [INFO] [logging.py:60:log_dist] [Rank 0] step=11490, skipped=9, lr=[0.0005927159629571575, 0.0005927159629571575], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11490 loss: 2.4678 iter time (s): 63.080 samples/sec: 16.233 %comms: 0.0028657106332458373 %optimizer_step 0.056047207233003445 %forward: 23.071130539142555 %backward: 61.875883053310766 [2025-04-02 07:20:55,113] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24490.83 | forward: 145532.64 | backward_microstep: 390321.40 | backward: 390312.94 | backward_inner_microstep: 390293.58 | backward_inner: 390287.42 | backward_allreduce_microstep: 9.29 | backward_allreduce: 2.63 | reduce_tied_grads: 0.31 | comms: 18.08 | reduce_grads: 0.19 | step: 353.55 | _step_clipping: 0.14 | _step_step: 351.77 | _step_zero_grad: 0.50 | _step_check_overflow: 0.57 samples/sec: 16.233 | iteration 11490/ 143000 | elapsed time per iteration (ms): 63080.6 | learning rate: 5.927E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.475184E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 07:31:28,135] [INFO] [logging.py:60:log_dist] [Rank 0] step=11500, skipped=9, lr=[0.000592701520679602, 0.000592701520679602], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11500 loss: 2.4826 iter time (s): 63.302 samples/sec: 16.177 %comms: 0.0028447133061158874 %optimizer_step 0.05913492652155699 %forward: 23.00834448922595 %backward: 61.66401495236966 [2025-04-02 07:31:28,135] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26599.57 | forward: 145646.72 | backward_microstep: 390352.30 | backward: 390343.65 | backward_inner_microstep: 390325.66 | backward_inner: 390317.50 | backward_allreduce_microstep: 9.50 | backward_allreduce: 4.45 | reduce_tied_grads: 0.34 | comms: 18.01 | reduce_grads: 0.21 | step: 374.33 | _step_clipping: 0.12 | _step_step: 372.56 | _step_zero_grad: 0.52 | _step_check_overflow: 0.55 samples/sec: 16.176 | iteration 11500/ 143000 | elapsed time per iteration (ms): 63302.3 | learning rate: 5.927E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.481972E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 07:41:56,381] [INFO] [logging.py:60:log_dist] [Rank 0] step=11510, skipped=9, lr=[0.0005926870642749587, 0.0005926870642749587], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11510 loss: 2.4831 iter time (s): 62.824 samples/sec: 16.299 %comms: 0.0028448216513216567 %optimizer_step 0.05532465027873712 %forward: 23.131258698713346 %backward: 62.108775657496636 [2025-04-02 07:41:56,382] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22332.09 | forward: 145320.03 | backward_microstep: 390200.89 | backward: 390192.74 | backward_inner_microstep: 390175.29 | backward_inner: 390169.25 | backward_allreduce_microstep: 7.54 | backward_allreduce: 2.53 | reduce_tied_grads: 0.27 | comms: 17.87 | reduce_grads: 0.18 | step: 347.57 | _step_clipping: 0.14 | _step_step: 345.98 | _step_zero_grad: 0.47 | _step_check_overflow: 0.44 samples/sec: 16.299 | iteration 11510/ 143000 | elapsed time per iteration (ms): 62824.6 | learning rate: 5.927E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.481228E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 07:52:19,273] [INFO] [logging.py:60:log_dist] [Rank 0] step=11520, skipped=9, lr=[0.000592672593743925, 0.000592672593743925], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11520 loss: 2.5004 iter time (s): 62.289 samples/sec: 16.440 %comms: 0.00287241301522964 %optimizer_step 0.055479705838236255 %forward: 23.315025679829375 %backward: 62.62442323526321 [2025-04-02 07:52:19,274] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17198.54 | forward: 145226.24 | backward_microstep: 390085.99 | backward: 390079.32 | backward_inner_microstep: 390063.68 | backward_inner: 390057.95 | backward_allreduce_microstep: 7.50 | backward_allreduce: 2.57 | reduce_tied_grads: 0.26 | comms: 17.89 | reduce_grads: 0.18 | step: 345.58 | _step_clipping: 0.12 | _step_step: 343.97 | _step_zero_grad: 0.46 | _step_check_overflow: 0.50 samples/sec: 16.439 | iteration 11520/ 143000 | elapsed time per iteration (ms): 62289.2 | learning rate: 5.927E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.480691E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 08:02:44,426] [INFO] [logging.py:60:log_dist] [Rank 0] step=11530, skipped=9, lr=[0.0005926581090871997, 0.0005926581090871997], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11530 loss: 2.4784 iter time (s): 62.515 samples/sec: 16.380 %comms: 0.0028619486801321355 %optimizer_step 0.05747668749783825 %forward: 23.24744936880527 %backward: 62.414752490621886 [2025-04-02 08:02:44,427] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19207.44 | forward: 145330.90 | backward_microstep: 390196.57 | backward: 390184.40 | backward_inner_microstep: 390166.70 | backward_inner: 390160.68 | backward_allreduce_microstep: 9.39 | backward_allreduce: 2.69 | reduce_tied_grads: 0.27 | comms: 17.89 | reduce_grads: 0.18 | step: 359.31 | _step_clipping: 0.13 | _step_step: 357.60 | _step_zero_grad: 0.50 | _step_check_overflow: 0.54 samples/sec: 16.380 | iteration 11530/ 143000 | elapsed time per iteration (ms): 62515.3 | learning rate: 5.927E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.479873E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 08:13:14,638] [INFO] [logging.py:60:log_dist] [Rank 0] step=11540, skipped=9, lr=[0.0005926436103054817, 0.0005926436103054817], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11540 loss: 2.4807 iter time (s): 63.021 samples/sec: 16.249 %comms: 0.0028788537092161908 %optimizer_step 0.058221048214999245 %forward: 23.110106501198477 %backward: 61.9365850133271 [2025-04-02 08:13:14,638] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23793.31 | forward: 145641.21 | backward_microstep: 390336.80 | backward: 390327.89 | backward_inner_microstep: 390310.19 | backward_inner: 390303.95 | backward_allreduce_microstep: 7.60 | backward_allreduce: 2.61 | reduce_tied_grads: 0.30 | comms: 18.14 | reduce_grads: 0.18 | step: 366.91 | _step_clipping: 0.11 | _step_step: 365.19 | _step_zero_grad: 0.50 | _step_check_overflow: 0.56 samples/sec: 16.249 | iteration 11540/ 143000 | elapsed time per iteration (ms): 63021.1 | learning rate: 5.926E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.474446E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 08:23:45,064] [INFO] [logging.py:60:log_dist] [Rank 0] step=11550, skipped=9, lr=[0.0005926290973994708, 0.0005926290973994708], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11550 loss: 2.4741 iter time (s): 63.042 samples/sec: 16.243 %comms: 0.0028509070597638594 %optimizer_step 0.05742522453755991 %forward: 23.081192745010444 %backward: 61.907732449416976 [2025-04-02 08:23:45,064] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24218.98 | forward: 145508.61 | backward_microstep: 390286.88 | backward: 390279.15 | backward_inner_microstep: 390263.44 | backward_inner: 390257.43 | backward_allreduce_microstep: 7.56 | backward_allreduce: 2.64 | reduce_tied_grads: 0.30 | comms: 17.97 | reduce_grads: 0.20 | step: 362.02 | _step_clipping: 0.14 | _step_step: 360.24 | _step_zero_grad: 0.49 | _step_check_overflow: 0.58 samples/sec: 16.243 | iteration 11550/ 143000 | elapsed time per iteration (ms): 63042.6 | learning rate: 5.926E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.477354E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 08:34:14,222] [INFO] [logging.py:60:log_dist] [Rank 0] step=11560, skipped=9, lr=[0.0005926145703698675, 0.0005926145703698675], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11560 loss: 2.4778 iter time (s): 62.915 samples/sec: 16.276 %comms: 0.002841264423581153 %optimizer_step 0.05536268738472256 %forward: 23.1280707046164 %backward: 62.04093463394161 [2025-04-02 08:34:14,223] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22894.84 | forward: 145511.03 | backward_microstep: 390340.62 | backward: 390332.61 | backward_inner_microstep: 390316.86 | backward_inner: 390310.76 | backward_allreduce_microstep: 7.55 | backward_allreduce: 2.60 | reduce_tied_grads: 0.27 | comms: 17.88 | reduce_grads: 0.18 | step: 348.32 | _step_clipping: 0.12 | _step_step: 346.65 | _step_zero_grad: 0.47 | _step_check_overflow: 0.51 samples/sec: 16.276 | iteration 11560/ 143000 | elapsed time per iteration (ms): 62915.9 | learning rate: 5.926E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.472965E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 08:44:40,845] [INFO] [logging.py:60:log_dist] [Rank 0] step=11570, skipped=9, lr=[0.0005926000292173729, 0.0005926000292173729], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11570 loss: 2.4504 iter time (s): 62.662 samples/sec: 16.342 %comms: 0.002869125429299603 %optimizer_step 0.05639347512213309 %forward: 23.25124629659541 %backward: 62.30972887072125 [2025-04-02 08:44:40,845] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19981.29 | forward: 145696.28 | backward_microstep: 390452.05 | backward: 390443.40 | backward_inner_microstep: 390426.94 | backward_inner: 390420.68 | backward_allreduce_microstep: 7.99 | backward_allreduce: 2.69 | reduce_tied_grads: 0.30 | comms: 17.98 | reduce_grads: 0.19 | step: 353.37 | _step_clipping: 0.13 | _step_step: 351.69 | _step_zero_grad: 0.47 | _step_check_overflow: 0.52 samples/sec: 16.342 | iteration 11570/ 143000 | elapsed time per iteration (ms): 62662.2 | learning rate: 5.926E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.466065E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 08:55:08,406] [INFO] [logging.py:60:log_dist] [Rank 0] step=11580, skipped=9, lr=[0.0005925854739426889, 0.0005925854739426889], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11580 loss: 2.4910 iter time (s): 62.756 samples/sec: 16.317 %comms: 0.0028493723246497656 %optimizer_step 0.05522417891524005 %forward: 23.1698699060862 %backward: 62.1980287733766 [2025-04-02 08:55:08,406] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21373.33 | forward: 145403.80 | backward_microstep: 390334.99 | backward: 390327.17 | backward_inner_microstep: 390309.67 | backward_inner: 390303.67 | backward_allreduce_microstep: 7.57 | backward_allreduce: 2.60 | reduce_tied_grads: 0.27 | comms: 17.88 | reduce_grads: 0.18 | step: 346.56 | _step_clipping: 0.13 | _step_step: 344.96 | _step_zero_grad: 0.46 | _step_check_overflow: 0.46 samples/sec: 16.317 | iteration 11580/ 143000 | elapsed time per iteration (ms): 62756.1 | learning rate: 5.926E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.466902E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 09:05:37,658] [INFO] [logging.py:60:log_dist] [Rank 0] step=11590, skipped=9, lr=[0.0005925709045465177, 0.0005925709045465177], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11590 loss: 2.4737 iter time (s): 62.925 samples/sec: 16.273 %comms: 0.0028468684103638434 %optimizer_step 0.05526310525320481 %forward: 23.11713373502522 %backward: 62.044788242839644 [2025-04-02 09:05:37,658] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22903.33 | forward: 145463.74 | backward_microstep: 390422.71 | backward: 390414.63 | backward_inner_microstep: 390398.72 | backward_inner: 390392.73 | backward_allreduce_microstep: 7.69 | backward_allreduce: 2.58 | reduce_tied_grads: 0.29 | comms: 17.91 | reduce_grads: 0.19 | step: 347.74 | _step_clipping: 0.13 | _step_step: 346.09 | _step_zero_grad: 0.47 | _step_check_overflow: 0.50 samples/sec: 16.273 | iteration 11590/ 143000 | elapsed time per iteration (ms): 62925.2 | learning rate: 5.926E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.477797E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 09:16:08,054] [INFO] [logging.py:60:log_dist] [Rank 0] step=11600, skipped=9, lr=[0.0005925563210295629, 0.0005925563210295629], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11600 loss: 2.4691 iter time (s): 63.039 samples/sec: 16.244 %comms: 0.002847980101938883 %optimizer_step 0.05631925847411899 %forward: 23.082016412513497 %backward: 61.92965881576586 [2025-04-02 09:16:08,054] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24027.96 | forward: 145506.84 | backward_microstep: 390407.50 | backward: 390398.68 | backward_inner_microstep: 390382.63 | backward_inner: 390376.45 | backward_allreduce_microstep: 7.71 | backward_allreduce: 2.67 | reduce_tied_grads: 0.30 | comms: 17.95 | reduce_grads: 0.20 | step: 355.03 | _step_clipping: 0.14 | _step_step: 353.34 | _step_zero_grad: 0.47 | _step_check_overflow: 0.52 samples/sec: 16.244 | iteration 11600/ 143000 | elapsed time per iteration (ms): 63039.6 | learning rate: 5.926E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.487811E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 09:26:35,496] [INFO] [logging.py:60:log_dist] [Rank 0] step=11610, skipped=9, lr=[0.0005925417233925282, 0.0005925417233925282], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11610 loss: 2.4738 iter time (s): 62.744 samples/sec: 16.320 %comms: 0.002851845271338412 %optimizer_step 0.05467258430342615 %forward: 23.191783935723958 %backward: 62.21378527886324 [2025-04-02 09:26:35,497] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21144.44 | forward: 145514.00 | backward_microstep: 390360.21 | backward: 390352.75 | backward_inner_microstep: 390337.16 | backward_inner: 390331.19 | backward_allreduce_microstep: 7.52 | backward_allreduce: 2.57 | reduce_tied_grads: 0.29 | comms: 17.89 | reduce_grads: 0.21 | step: 343.04 | _step_clipping: 0.11 | _step_step: 341.20 | _step_zero_grad: 0.45 | _step_check_overflow: 0.73 samples/sec: 16.320 | iteration 11610/ 143000 | elapsed time per iteration (ms): 62744.3 | learning rate: 5.925E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.475749E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 09:36:58,074] [INFO] [logging.py:60:log_dist] [Rank 0] step=11620, skipped=9, lr=[0.0005925271116361181, 0.0005925271116361181], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11620 loss: 2.4711 iter time (s): 62.257 samples/sec: 16.448 %comms: 0.0029179413090788184 %optimizer_step 0.058242790192777776 %forward: 23.346647743088702 %backward: 62.69115390145888 [2025-04-02 09:36:58,075] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16492.50 | forward: 145349.84 | backward_microstep: 390307.88 | backward: 390297.96 | backward_inner_microstep: 390282.06 | backward_inner: 390275.97 | backward_allreduce_microstep: 7.64 | backward_allreduce: 2.66 | reduce_tied_grads: 0.35 | comms: 18.17 | reduce_grads: 0.22 | step: 362.60 | _step_clipping: 0.13 | _step_step: 360.80 | _step_zero_grad: 0.51 | _step_check_overflow: 0.56 samples/sec: 16.448 | iteration 11620/ 143000 | elapsed time per iteration (ms): 62257.8 | learning rate: 5.925E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.469332E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 09:47:24,201] [INFO] [logging.py:60:log_dist] [Rank 0] step=11630, skipped=9, lr=[0.0005925124857610378, 0.0005925124857610378], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11630 loss: 2.4683 iter time (s): 62.612 samples/sec: 16.355 %comms: 0.0028763494060125997 %optimizer_step 0.056337981273240016 %forward: 23.23046800431677 %backward: 62.36065637977393 [2025-04-02 09:47:24,202] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19729.43 | forward: 145450.81 | backward_microstep: 390465.22 | backward: 390453.09 | backward_inner_microstep: 390436.62 | backward_inner: 390430.04 | backward_allreduce_microstep: 7.81 | backward_allreduce: 2.67 | reduce_tied_grads: 0.30 | comms: 18.01 | reduce_grads: 0.19 | step: 352.74 | _step_clipping: 0.11 | _step_step: 350.96 | _step_zero_grad: 0.49 | _step_check_overflow: 0.60 samples/sec: 16.355 | iteration 11630/ 143000 | elapsed time per iteration (ms): 62612.6 | learning rate: 5.925E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.472950E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 09:57:57,196] [INFO] [logging.py:60:log_dist] [Rank 0] step=11640, skipped=9, lr=[0.0005924978457679933, 0.0005924978457679933], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11640 loss: 2.4820 iter time (s): 63.299 samples/sec: 16.177 %comms: 0.0028335768154112897 %optimizer_step 0.05474528412270208 %forward: 22.982395885568423 %backward: 61.65108792901181 [2025-04-02 09:57:57,196] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26882.01 | forward: 145476.04 | backward_microstep: 390252.12 | backward: 390244.61 | backward_inner_microstep: 390228.70 | backward_inner: 390222.71 | backward_allreduce_microstep: 7.63 | backward_allreduce: 2.64 | reduce_tied_grads: 0.29 | comms: 17.94 | reduce_grads: 0.18 | step: 346.53 | _step_clipping: 0.12 | _step_step: 344.90 | _step_zero_grad: 0.47 | _step_check_overflow: 0.49 samples/sec: 16.177 | iteration 11640/ 143000 | elapsed time per iteration (ms): 63299.4 | learning rate: 5.925E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.473378E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 10:08:25,139] [INFO] [logging.py:60:log_dist] [Rank 0] step=11650, skipped=9, lr=[0.0005924831916576912, 0.0005924831916576912], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11650 loss: 2.4610 iter time (s): 62.794 samples/sec: 16.307 %comms: 0.002872013468497041 %optimizer_step 0.05575237958989751 %forward: 23.21121425905351 %backward: 62.20697353498402 [2025-04-02 10:08:25,140] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21105.84 | forward: 145751.99 | backward_microstep: 390630.76 | backward: 390621.10 | backward_inner_microstep: 390604.75 | backward_inner: 390598.42 | backward_allreduce_microstep: 7.80 | backward_allreduce: 2.68 | reduce_tied_grads: 0.29 | comms: 18.03 | reduce_grads: 0.19 | step: 350.09 | _step_clipping: 0.13 | _step_step: 348.37 | _step_zero_grad: 0.49 | _step_check_overflow: 0.53 samples/sec: 16.307 | iteration 11650/ 143000 | elapsed time per iteration (ms): 62794.4 | learning rate: 5.925E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.465647E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 10:18:50,356] [INFO] [logging.py:60:log_dist] [Rank 0] step=11660, skipped=9, lr=[0.0005924685234308387, 0.0005924685234308387], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11660 loss: 2.4833 iter time (s): 62.521 samples/sec: 16.378 %comms: 0.002861543028475562 %optimizer_step 0.05547859971637551 %forward: 23.266730336847534 %backward: 62.463002070306686 [2025-04-02 10:18:50,357] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18788.22 | forward: 145466.24 | backward_microstep: 390534.59 | backward: 390525.78 | backward_inner_microstep: 390509.64 | backward_inner: 390503.45 | backward_allreduce_microstep: 7.65 | backward_allreduce: 2.63 | reduce_tied_grads: 0.29 | comms: 17.89 | reduce_grads: 0.18 | step: 346.86 | _step_clipping: 0.13 | _step_step: 345.22 | _step_zero_grad: 0.46 | _step_check_overflow: 0.51 samples/sec: 16.378 | iteration 11660/ 143000 | elapsed time per iteration (ms): 62521.7 | learning rate: 5.925E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.480074E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 10:29:10,427] [INFO] [logging.py:60:log_dist] [Rank 0] step=11670, skipped=9, lr=[0.0005924538410881438, 0.0005924538410881438], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11670 loss: 2.4814 iter time (s): 62.007 samples/sec: 16.514 %comms: 0.002882984957149025 %optimizer_step 0.05675741817091553 %forward: 23.43384523168565 %backward: 62.94562716517201 [2025-04-02 10:29:10,427] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14052.01 | forward: 145305.12 | backward_microstep: 390310.28 | backward: 390303.93 | backward_inner_microstep: 390286.66 | backward_inner: 390281.00 | backward_allreduce_microstep: 7.52 | backward_allreduce: 2.59 | reduce_tied_grads: 0.26 | comms: 17.88 | reduce_grads: 0.18 | step: 351.93 | _step_clipping: 0.11 | _step_step: 350.30 | _step_zero_grad: 0.47 | _step_check_overflow: 0.51 samples/sec: 16.514 | iteration 11670/ 143000 | elapsed time per iteration (ms): 62007.1 | learning rate: 5.925E-04 | approx flops per GPU: 71.2TFLOPS | lm_loss: 2.479435E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 10:39:34,987] [INFO] [logging.py:60:log_dist] [Rank 0] step=11680, skipped=9, lr=[0.0005924391446303152, 0.0005924391446303152], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11680 loss: 2.4628 iter time (s): 62.455 samples/sec: 16.396 %comms: 0.0028752780687738132 %optimizer_step 0.05575947509207592 %forward: 23.25795663738097 %backward: 62.47527173537454 [2025-04-02 10:39:34,987] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18686.08 | forward: 145258.69 | backward_microstep: 390199.86 | backward: 390192.31 | backward_inner_microstep: 390176.52 | backward_inner: 390170.48 | backward_allreduce_microstep: 7.55 | backward_allreduce: 2.60 | reduce_tied_grads: 0.28 | comms: 17.96 | reduce_grads: 0.18 | step: 348.25 | _step_clipping: 0.11 | _step_step: 346.57 | _step_zero_grad: 0.47 | _step_check_overflow: 0.54 samples/sec: 16.396 | iteration 11680/ 143000 | elapsed time per iteration (ms): 62456.0 | learning rate: 5.924E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.475438E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 10:50:03,002] [INFO] [logging.py:60:log_dist] [Rank 0] step=11690, skipped=9, lr=[0.000592424434058062, 0.000592424434058062], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11690 loss: 2.4579 iter time (s): 62.801 samples/sec: 16.305 %comms: 0.00285467612796351 %optimizer_step 0.05505260793891232 %forward: 23.142217016019785 %backward: 62.11962391122236 [2025-04-02 10:50:03,003] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22164.13 | forward: 145335.40 | backward_microstep: 390124.83 | backward: 390117.34 | backward_inner_microstep: 390101.49 | backward_inner: 390095.50 | backward_allreduce_microstep: 7.74 | backward_allreduce: 2.58 | reduce_tied_grads: 0.28 | comms: 17.93 | reduce_grads: 0.19 | step: 345.74 | _step_clipping: 0.12 | _step_step: 344.08 | _step_zero_grad: 0.45 | _step_check_overflow: 0.54 samples/sec: 16.305 | iteration 11690/ 143000 | elapsed time per iteration (ms): 62801.5 | learning rate: 5.924E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.465301E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 11:00:28,592] [INFO] [logging.py:60:log_dist] [Rank 0] step=11700, skipped=9, lr=[0.0005924097093720944, 0.0005924097093720944], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11700 loss: 2.5038 iter time (s): 62.558 samples/sec: 16.369 %comms: 0.0028659704282638933 %optimizer_step 0.057967491372892595 %forward: 23.22100944410849 %backward: 62.36737029646561 [2025-04-02 11:00:28,593] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19777.89 | forward: 145267.12 | backward_microstep: 390170.58 | backward: 390160.83 | backward_inner_microstep: 390144.93 | backward_inner: 390138.90 | backward_allreduce_microstep: 7.65 | backward_allreduce: 2.63 | reduce_tied_grads: 0.28 | comms: 17.93 | reduce_grads: 0.19 | step: 362.64 | _step_clipping: 0.12 | _step_step: 360.67 | _step_zero_grad: 0.54 | _step_check_overflow: 0.72 samples/sec: 16.369 | iteration 11700/ 143000 | elapsed time per iteration (ms): 62559.1 | learning rate: 5.924E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.477573E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 11:10:56,584] [INFO] [logging.py:60:log_dist] [Rank 0] step=11710, skipped=9, lr=[0.0005923949705731231, 0.0005923949705731231], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11710 loss: 2.5000 iter time (s): 62.799 samples/sec: 16.306 %comms: 0.00286165933739806 %optimizer_step 0.05561057370091568 %forward: 23.132131927535013 %backward: 62.116202231941806 [2025-04-02 11:10:56,584] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22248.84 | forward: 145266.39 | backward_microstep: 390087.72 | backward: 390080.63 | backward_inner_microstep: 390061.51 | backward_inner: 390055.71 | backward_allreduce_microstep: 9.29 | backward_allreduce: 2.61 | reduce_tied_grads: 0.30 | comms: 17.97 | reduce_grads: 0.18 | step: 349.23 | _step_clipping: 0.13 | _step_step: 347.59 | _step_zero_grad: 0.48 | _step_check_overflow: 0.47 samples/sec: 16.306 | iteration 11710/ 143000 | elapsed time per iteration (ms): 62799.1 | learning rate: 5.924E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.485063E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 11:21:28,259] [INFO] [logging.py:60:log_dist] [Rank 0] step=11720, skipped=9, lr=[0.0005923802176618593, 0.0005923802176618593], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11720 loss: 2.4717 iter time (s): 63.167 samples/sec: 16.211 %comms: 0.0028615383033031875 %optimizer_step 0.05535567226584481 %forward: 23.015811618351663 %backward: 61.76778230168151 [2025-04-02 11:21:28,259] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25676.90 | forward: 145383.87 | backward_microstep: 390175.61 | backward: 390168.27 | backward_inner_microstep: 390152.74 | backward_inner: 390146.68 | backward_allreduce_microstep: 7.46 | backward_allreduce: 2.58 | reduce_tied_grads: 0.30 | comms: 18.08 | reduce_grads: 0.19 | step: 349.66 | _step_clipping: 0.10 | _step_step: 347.92 | _step_zero_grad: 0.54 | _step_check_overflow: 0.52 samples/sec: 16.211 | iteration 11720/ 143000 | elapsed time per iteration (ms): 63167.5 | learning rate: 5.924E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.474444E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 11:31:53,572] [INFO] [logging.py:60:log_dist] [Rank 0] step=11730, skipped=9, lr=[0.0005923654506390151, 0.0005923654506390151], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11730 loss: 2.4669 iter time (s): 62.531 samples/sec: 16.376 %comms: 0.0028662522894916504 %optimizer_step 0.05760639407041085 %forward: 23.229927194816458 %backward: 62.39229376147195 [2025-04-02 11:31:53,572] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19494.36 | forward: 145258.38 | backward_microstep: 390152.60 | backward: 390143.43 | backward_inner_microstep: 390126.10 | backward_inner: 390120.27 | backward_allreduce_microstep: 9.26 | backward_allreduce: 2.60 | reduce_tied_grads: 0.28 | comms: 17.92 | reduce_grads: 0.21 | step: 360.22 | _step_clipping: 0.13 | _step_step: 358.52 | _step_zero_grad: 0.49 | _step_check_overflow: 0.54 samples/sec: 16.376 | iteration 11730/ 143000 | elapsed time per iteration (ms): 62531.3 | learning rate: 5.924E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.477888E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 11:42:20,969] [INFO] [logging.py:60:log_dist] [Rank 0] step=11740, skipped=9, lr=[0.0005923506695053033, 0.0005923506695053033], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11740 loss: 2.4717 iter time (s): 62.739 samples/sec: 16.322 %comms: 0.0028462762676886274 %optimizer_step 0.054988777600361284 %forward: 23.141631846745938 %backward: 62.172302018862325 [2025-04-02 11:42:20,970] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21764.50 | forward: 145188.78 | backward_microstep: 390070.63 | backward: 390064.13 | backward_inner_microstep: 390047.06 | backward_inner: 390041.01 | backward_allreduce_microstep: 7.48 | backward_allreduce: 2.57 | reduce_tied_grads: 0.27 | comms: 17.86 | reduce_grads: 0.18 | step: 345.00 | _step_clipping: 0.12 | _step_step: 343.39 | _step_zero_grad: 0.47 | _step_check_overflow: 0.47 samples/sec: 16.321 | iteration 11740/ 143000 | elapsed time per iteration (ms): 62739.7 | learning rate: 5.924E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.473732E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 11:52:50,104] [INFO] [logging.py:60:log_dist] [Rank 0] step=11750, skipped=9, lr=[0.0005923358742614372, 0.0005923358742614372], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11750 loss: 2.4704 iter time (s): 62.913 samples/sec: 16.276 %comms: 0.0028556602869869007 %optimizer_step 0.05614146115976989 %forward: 23.10738350870494 %backward: 62.029457305728265 [2025-04-02 11:52:50,104] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23066.66 | forward: 145375.30 | backward_microstep: 390252.77 | backward: 390245.45 | backward_inner_microstep: 390229.84 | backward_inner: 390223.83 | backward_allreduce_microstep: 7.48 | backward_allreduce: 2.59 | reduce_tied_grads: 0.28 | comms: 17.97 | reduce_grads: 0.19 | step: 353.20 | _step_clipping: 0.15 | _step_step: 351.52 | _step_zero_grad: 0.47 | _step_check_overflow: 0.51 samples/sec: 16.276 | iteration 11750/ 143000 | elapsed time per iteration (ms): 62913.5 | learning rate: 5.923E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.466641E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 12:03:19,139] [INFO] [logging.py:60:log_dist] [Rank 0] step=11760, skipped=9, lr=[0.000592321064908131, 0.000592321064908131], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11760 loss: 2.4793 iter time (s): 62.903 samples/sec: 16.279 %comms: 0.0028618322584542033 %optimizer_step 0.05514957899492974 %forward: 23.123625722129386 %backward: 62.050076807438856 [2025-04-02 12:03:19,140] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22837.27 | forward: 145454.63 | backward_microstep: 390321.28 | backward: 390313.84 | backward_inner_microstep: 390298.22 | backward_inner: 390292.14 | backward_allreduce_microstep: 7.51 | backward_allreduce: 2.59 | reduce_tied_grads: 0.30 | comms: 18.00 | reduce_grads: 0.18 | step: 346.91 | _step_clipping: 0.11 | _step_step: 345.27 | _step_zero_grad: 0.50 | _step_check_overflow: 0.47 samples/sec: 16.279 | iteration 11760/ 143000 | elapsed time per iteration (ms): 62903.6 | learning rate: 5.923E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.475008E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 12:13:42,962] [INFO] [logging.py:60:log_dist] [Rank 0] step=11770, skipped=9, lr=[0.0005923062414460994, 0.0005923062414460994], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11770 loss: 2.4743 iter time (s): 62.382 samples/sec: 16.415 %comms: 0.0028690863441986187 %optimizer_step 0.05617258216815877 %forward: 23.28709966185695 %backward: 62.554438882024165 [2025-04-02 12:13:42,962] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17900.01 | forward: 145268.85 | backward_microstep: 390232.75 | backward: 390225.14 | backward_inner_microstep: 390209.45 | backward_inner: 390203.32 | backward_allreduce_microstep: 7.53 | backward_allreduce: 2.58 | reduce_tied_grads: 0.28 | comms: 17.90 | reduce_grads: 0.18 | step: 350.41 | _step_clipping: 0.13 | _step_step: 348.74 | _step_zero_grad: 0.46 | _step_check_overflow: 0.54 samples/sec: 16.415 | iteration 11770/ 143000 | elapsed time per iteration (ms): 62382.2 | learning rate: 5.923E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.476293E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 12:24:13,587] [INFO] [logging.py:60:log_dist] [Rank 0] step=11780, skipped=9, lr=[0.0005922914038760577, 0.0005922914038760577], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11780 loss: 2.4898 iter time (s): 63.062 samples/sec: 16.238 %comms: 0.002849591028281248 %optimizer_step 0.055737459116594605 %forward: 23.075241647328642 %backward: 61.89172727244492 [2025-04-02 12:24:13,588] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24316.29 | forward: 145517.04 | backward_microstep: 390309.21 | backward: 390301.48 | backward_inner_microstep: 390285.58 | backward_inner: 390279.52 | backward_allreduce_microstep: 7.65 | backward_allreduce: 2.64 | reduce_tied_grads: 0.31 | comms: 17.97 | reduce_grads: 0.20 | step: 351.49 | _step_clipping: 0.12 | _step_step: 349.78 | _step_zero_grad: 0.49 | _step_check_overflow: 0.53 samples/sec: 16.238 | iteration 11780/ 143000 | elapsed time per iteration (ms): 63062.5 | learning rate: 5.923E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.472754E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 12:34:39,966] [INFO] [logging.py:60:log_dist] [Rank 0] step=11790, skipped=9, lr=[0.0005922765521987223, 0.0005922765521987223], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11790 loss: 2.4558 iter time (s): 62.637 samples/sec: 16.348 %comms: 0.0028638066002424543 %optimizer_step 0.05597010403935932 %forward: 23.239498349009864 %backward: 62.34144682234015 [2025-04-02 12:34:39,967] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19819.85 | forward: 145566.15 | backward_microstep: 390499.76 | backward: 390490.55 | backward_inner_microstep: 390472.82 | backward_inner: 390466.71 | backward_allreduce_microstep: 9.22 | backward_allreduce: 2.57 | reduce_tied_grads: 0.29 | comms: 17.94 | reduce_grads: 0.23 | step: 350.58 | _step_clipping: 0.12 | _step_step: 348.94 | _step_zero_grad: 0.45 | _step_check_overflow: 0.52 samples/sec: 16.348 | iteration 11790/ 143000 | elapsed time per iteration (ms): 62637.9 | learning rate: 5.923E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.469743E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 12:45:03,675] [INFO] [logging.py:60:log_dist] [Rank 0] step=11800, skipped=9, lr=[0.0005922616864148099, 0.0005922616864148099], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11800 loss: 2.4729 iter time (s): 62.370 samples/sec: 16.418 %comms: 0.0029024066203938207 %optimizer_step 0.05550436472899924 %forward: 23.313417244919556 %backward: 62.596932862967904 [2025-04-02 12:45:03,676] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17382.69 | forward: 145406.57 | backward_microstep: 390430.13 | backward: 390419.17 | backward_inner_microstep: 390403.27 | backward_inner: 390396.69 | backward_allreduce_microstep: 7.56 | backward_allreduce: 2.59 | reduce_tied_grads: 0.30 | comms: 18.10 | reduce_grads: 0.19 | step: 346.18 | _step_clipping: 0.12 | _step_step: 344.56 | _step_zero_grad: 0.46 | _step_check_overflow: 0.47 samples/sec: 16.418 | iteration 11800/ 143000 | elapsed time per iteration (ms): 62370.9 | learning rate: 5.923E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.464739E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 12:55:31,730] [INFO] [logging.py:60:log_dist] [Rank 0] step=11810, skipped=9, lr=[0.000592246806525038, 0.000592246806525038], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11810 loss: 2.4785 iter time (s): 62.805 samples/sec: 16.304 %comms: 0.002852936955898001 %optimizer_step 0.05555473383200041 %forward: 23.202547268599133 %backward: 62.16134048591666 [2025-04-02 12:55:31,731] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21452.55 | forward: 145723.61 | backward_microstep: 390413.69 | backward: 390404.32 | backward_inner_microstep: 390387.70 | backward_inner: 390381.36 | backward_allreduce_microstep: 7.89 | backward_allreduce: 2.71 | reduce_tied_grads: 0.30 | comms: 17.92 | reduce_grads: 0.22 | step: 348.91 | _step_clipping: 0.12 | _step_step: 347.22 | _step_zero_grad: 0.48 | _step_check_overflow: 0.53 samples/sec: 16.304 | iteration 11810/ 143000 | elapsed time per iteration (ms): 62805.5 | learning rate: 5.922E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.473755E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 13:06:00,082] [INFO] [logging.py:60:log_dist] [Rank 0] step=11820, skipped=9, lr=[0.0005922319125301246, 0.0005922319125301246], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11820 loss: 2.4969 iter time (s): 62.835 samples/sec: 16.297 %comms: 0.002861569912515362 %optimizer_step 0.05787014424462256 %forward: 23.189586204643735 %backward: 62.143466121937315 [2025-04-02 13:06:00,083] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21667.49 | forward: 145710.96 | backward_microstep: 390486.67 | backward: 390476.32 | backward_inner_microstep: 390459.52 | backward_inner: 390453.15 | backward_allreduce_microstep: 8.01 | backward_allreduce: 2.75 | reduce_tied_grads: 0.30 | comms: 17.98 | reduce_grads: 0.21 | step: 363.63 | _step_clipping: 0.14 | _step_step: 361.82 | _step_zero_grad: 0.50 | _step_check_overflow: 0.58 samples/sec: 16.297 | iteration 11820/ 143000 | elapsed time per iteration (ms): 62835.2 | learning rate: 5.922E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.481560E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 13:16:35,339] [INFO] [logging.py:60:log_dist] [Rank 0] step=11830, skipped=9, lr=[0.0005922170044307888, 0.0005922170044307888], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11830 loss: 2.4719 iter time (s): 63.525 samples/sec: 16.120 %comms: 0.002902413758269156 %optimizer_step 0.05685248805312557 %forward: 22.952206144465436 %backward: 61.456392814291135 [2025-04-02 13:16:35,340] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28560.38 | forward: 145804.21 | backward_microstep: 390414.80 | backward: 390402.59 | backward_inner_microstep: 390385.64 | backward_inner: 390379.01 | backward_allreduce_microstep: 8.01 | backward_allreduce: 2.80 | reduce_tied_grads: 0.35 | comms: 18.44 | reduce_grads: 0.24 | step: 361.16 | _step_clipping: 0.13 | _step_step: 359.28 | _step_zero_grad: 0.51 | _step_check_overflow: 0.61 samples/sec: 16.119 | iteration 11830/ 143000 | elapsed time per iteration (ms): 63525.7 | learning rate: 5.922E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.471575E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 13:27:04,179] [INFO] [logging.py:60:log_dist] [Rank 0] step=11840, skipped=9, lr=[0.00059220208222775, 0.00059220208222775], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11840 loss: 2.4727 iter time (s): 62.883 samples/sec: 16.284 %comms: 0.002907011828244516 %optimizer_step 0.05879752083776926 %forward: 23.16523588083847 %backward: 62.085085170074095 [2025-04-02 13:27:04,180] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22265.15 | forward: 145670.79 | backward_microstep: 390424.54 | backward: 390411.89 | backward_inner_microstep: 390394.32 | backward_inner: 390387.63 | backward_allreduce_microstep: 8.24 | backward_allreduce: 2.82 | reduce_tied_grads: 0.34 | comms: 18.28 | reduce_grads: 0.22 | step: 369.74 | _step_clipping: 0.14 | _step_step: 367.79 | _step_zero_grad: 0.56 | _step_check_overflow: 0.62 samples/sec: 16.284 | iteration 11840/ 143000 | elapsed time per iteration (ms): 62884.0 | learning rate: 5.922E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.478917E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 13:37:28,863] [INFO] [logging.py:60:log_dist] [Rank 0] step=11850, skipped=9, lr=[0.0005921871459217285, 0.0005921871459217285], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11850 loss: 2.4851 iter time (s): 62.468 samples/sec: 16.392 %comms: 0.0029071553065110746 %optimizer_step 0.058428669160329356 %forward: 23.310374617857164 %backward: 62.49281808215945 [2025-04-02 13:37:28,863] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18195.22 | forward: 145614.65 | backward_microstep: 390390.58 | backward: 390378.53 | backward_inner_microstep: 390361.00 | backward_inner: 390354.21 | backward_allreduce_microstep: 8.52 | backward_allreduce: 2.84 | reduce_tied_grads: 0.32 | comms: 18.16 | reduce_grads: 0.21 | step: 364.99 | _step_clipping: 0.13 | _step_step: 363.11 | _step_zero_grad: 0.55 | _step_check_overflow: 0.56 samples/sec: 16.392 | iteration 11850/ 143000 | elapsed time per iteration (ms): 62468.3 | learning rate: 5.922E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.480431E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 13:47:59,794] [INFO] [logging.py:60:log_dist] [Rank 0] step=11860, skipped=9, lr=[0.000592172195513445, 0.000592172195513445], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11860 loss: 2.4604 iter time (s): 63.093 samples/sec: 16.230 %comms: 0.0028541416667426725 %optimizer_step 0.056110341500013546 %forward: 23.068486296397058 %backward: 61.85972479092495 [2025-04-02 13:47:59,795] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24620.20 | forward: 145545.04 | backward_microstep: 390299.88 | backward: 390288.98 | backward_inner_microstep: 390270.05 | backward_inner: 390263.61 | backward_allreduce_microstep: 10.05 | backward_allreduce: 4.68 | reduce_tied_grads: 0.32 | comms: 18.01 | reduce_grads: 0.23 | step: 354.01 | _step_clipping: 0.14 | _step_step: 352.18 | _step_zero_grad: 0.51 | _step_check_overflow: 0.60 samples/sec: 16.230 | iteration 11860/ 143000 | elapsed time per iteration (ms): 63093.2 | learning rate: 5.922E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.471008E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 13:58:31,288] [INFO] [logging.py:60:log_dist] [Rank 0] step=11870, skipped=9, lr=[0.0005921572310036213, 0.0005921572310036213], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11870 loss: 2.4703 iter time (s): 63.149 samples/sec: 16.216 %comms: 0.002865117570749091 %optimizer_step 0.05663197272579632 %forward: 23.02307232133155 %backward: 61.801806886148704 [2025-04-02 13:58:31,288] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25370.02 | forward: 145387.92 | backward_microstep: 390280.93 | backward: 390270.93 | backward_inner_microstep: 390254.22 | backward_inner: 390247.79 | backward_allreduce_microstep: 7.94 | backward_allreduce: 2.73 | reduce_tied_grads: 0.32 | comms: 18.09 | reduce_grads: 0.21 | step: 357.62 | _step_clipping: 0.12 | _step_step: 355.82 | _step_zero_grad: 0.54 | _step_check_overflow: 0.55 samples/sec: 16.216 | iteration 11870/ 143000 | elapsed time per iteration (ms): 63149.4 | learning rate: 5.922E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.465145E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 14:09:02,247] [INFO] [logging.py:60:log_dist] [Rank 0] step=11880, skipped=9, lr=[0.0005921422523929795, 0.0005921422523929795], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11880 loss: 2.4577 iter time (s): 63.095 samples/sec: 16.229 %comms: 0.0028660683364342705 %optimizer_step 0.056753541487524164 %forward: 23.07638889373822 %backward: 61.84733546318396 [2025-04-02 14:09:02,248] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24668.13 | forward: 145601.40 | backward_microstep: 390238.39 | backward: 390228.23 | backward_inner_microstep: 390211.75 | backward_inner: 390205.40 | backward_allreduce_microstep: 7.86 | backward_allreduce: 2.70 | reduce_tied_grads: 0.30 | comms: 18.08 | reduce_grads: 0.19 | step: 358.09 | _step_clipping: 0.13 | _step_step: 356.30 | _step_zero_grad: 0.51 | _step_check_overflow: 0.55 samples/sec: 16.229 | iteration 11880/ 143000 | elapsed time per iteration (ms): 63096.0 | learning rate: 5.921E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.466821E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 14:19:29,357] [INFO] [logging.py:60:log_dist] [Rank 0] step=11890, skipped=9, lr=[0.0005921272596822426, 0.0005921272596822426], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11890 loss: 2.4555 iter time (s): 62.710 samples/sec: 16.329 %comms: 0.0029162856517817126 %optimizer_step 0.05842034233434993 %forward: 23.209564880413673 %backward: 62.23826860462265 [2025-04-02 14:19:29,357] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20781.85 | forward: 145548.04 | backward_microstep: 390309.80 | backward: 390298.49 | backward_inner_microstep: 390282.21 | backward_inner: 390275.61 | backward_allreduce_microstep: 7.64 | backward_allreduce: 2.62 | reduce_tied_grads: 0.34 | comms: 18.29 | reduce_grads: 0.21 | step: 366.36 | _step_clipping: 0.13 | _step_step: 364.47 | _step_zero_grad: 0.50 | _step_check_overflow: 0.63 samples/sec: 16.329 | iteration 11890/ 143000 | elapsed time per iteration (ms): 62710.9 | learning rate: 5.921E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.463766E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 14:29:58,411] [INFO] [logging.py:60:log_dist] [Rank 0] step=11900, skipped=9, lr=[0.0005921122528721343, 0.0005921122528721343], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11900 loss: 2.4666 iter time (s): 62.905 samples/sec: 16.279 %comms: 0.0028661482229874386 %optimizer_step 0.055769344674127336 %forward: 23.109615653260384 %backward: 62.04876702630659 [2025-04-02 14:29:58,411] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22884.02 | forward: 145370.60 | backward_microstep: 390327.95 | backward: 390316.60 | backward_inner_microstep: 390297.81 | backward_inner: 390291.23 | backward_allreduce_microstep: 7.97 | backward_allreduce: 2.75 | reduce_tied_grads: 0.36 | comms: 18.03 | reduce_grads: 0.19 | step: 350.82 | _step_clipping: 0.13 | _step_step: 349.09 | _step_zero_grad: 0.49 | _step_check_overflow: 0.53 samples/sec: 16.278 | iteration 11900/ 143000 | elapsed time per iteration (ms): 62905.4 | learning rate: 5.921E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.468764E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 14:40:25,320] [INFO] [logging.py:60:log_dist] [Rank 0] step=11910, skipped=9, lr=[0.0005920972319633787, 0.0005920972319633787], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11910 loss: 2.4688 iter time (s): 62.690 samples/sec: 16.334 %comms: 0.0028733652292529926 %optimizer_step 0.05716183862710504 %forward: 23.169111284765602 %backward: 62.22734558120795 [2025-04-02 14:40:25,321] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21110.65 | forward: 145248.06 | backward_microstep: 390113.90 | backward: 390105.66 | backward_inner_microstep: 390088.92 | backward_inner: 390082.62 | backward_allreduce_microstep: 8.07 | backward_allreduce: 2.74 | reduce_tied_grads: 0.30 | comms: 18.01 | reduce_grads: 0.21 | step: 358.35 | _step_clipping: 0.12 | _step_step: 356.51 | _step_zero_grad: 0.54 | _step_check_overflow: 0.58 samples/sec: 16.334 | iteration 11910/ 143000 | elapsed time per iteration (ms): 62691.0 | learning rate: 5.921E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.465428E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 14:50:55,884] [INFO] [logging.py:60:log_dist] [Rank 0] step=11920, skipped=9, lr=[0.0005920821969567009, 0.0005920821969567009], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11920 loss: 2.4691 iter time (s): 63.056 samples/sec: 16.240 %comms: 0.002859701162491398 %optimizer_step 0.057577877784966695 %forward: 23.071965566632922 %backward: 61.88622993132198 [2025-04-02 14:50:55,884] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24396.39 | forward: 145482.12 | backward_microstep: 390238.35 | backward: 390228.56 | backward_inner_microstep: 390210.87 | backward_inner: 390204.31 | backward_allreduce_microstep: 8.65 | backward_allreduce: 3.02 | reduce_tied_grads: 0.32 | comms: 18.03 | reduce_grads: 0.22 | step: 363.06 | _step_clipping: 0.14 | _step_step: 361.15 | _step_zero_grad: 0.54 | _step_check_overflow: 0.63 samples/sec: 16.239 | iteration 11920/ 143000 | elapsed time per iteration (ms): 63056.4 | learning rate: 5.921E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.471931E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 15:01:40,782] [INFO] [logging.py:60:log_dist] [Rank 0] step=11930, skipped=9, lr=[0.0005920671478528266, 0.0005920671478528266], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11930 loss: 2.4752 iter time (s): 64.489 samples/sec: 15.879 %comms: 0.0027732530393647895 %optimizer_step 0.05389800072331281 %forward: 22.585955515410085 %backward: 60.500320387383965 [2025-04-02 15:01:40,782] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 38619.87 | forward: 145655.07 | backward_microstep: 390170.55 | backward: 390161.85 | backward_inner_microstep: 390143.50 | backward_inner: 390137.15 | backward_allreduce_microstep: 8.02 | backward_allreduce: 2.74 | reduce_tied_grads: 0.29 | comms: 17.88 | reduce_grads: 0.19 | step: 347.58 | _step_clipping: 0.12 | _step_step: 345.97 | _step_zero_grad: 0.45 | _step_check_overflow: 0.49 samples/sec: 15.878 | iteration 11930/ 143000 | elapsed time per iteration (ms): 64489.8 | learning rate: 5.921E-04 | approx flops per GPU: 68.5TFLOPS | lm_loss: 2.469130E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 15:12:13,526] [INFO] [logging.py:60:log_dist] [Rank 0] step=11940, skipped=9, lr=[0.000592052084652482, 0.000592052084652482], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11940 loss: 2.4724 iter time (s): 63.274 samples/sec: 16.184 %comms: 0.0028538401053609824 %optimizer_step 0.05630699100377718 %forward: 23.009245388396657 %backward: 61.68412227725085 [2025-04-02 15:12:13,526] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26347.95 | forward: 145588.36 | backward_microstep: 390309.70 | backward: 390299.22 | backward_inner_microstep: 390281.95 | backward_inner: 390275.50 | backward_allreduce_microstep: 8.27 | backward_allreduce: 2.77 | reduce_tied_grads: 0.29 | comms: 18.06 | reduce_grads: 0.20 | step: 356.28 | _step_clipping: 0.12 | _step_step: 354.40 | _step_zero_grad: 0.57 | _step_check_overflow: 0.56 samples/sec: 16.183 | iteration 11940/ 143000 | elapsed time per iteration (ms): 63274.4 | learning rate: 5.921E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.467472E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 15:22:42,556] [INFO] [logging.py:60:log_dist] [Rank 0] step=11950, skipped=9, lr=[0.0005920370073563943, 0.0005920370073563943], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11950 loss: 2.4683 iter time (s): 62.902 samples/sec: 16.279 %comms: 0.0029424432615885623 %optimizer_step 0.05882312913440525 %forward: 23.15590184166501 %backward: 62.08431263526857 [2025-04-02 15:22:42,557] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22308.65 | forward: 145656.17 | backward_microstep: 390536.90 | backward: 390525.19 | backward_inner_microstep: 390507.59 | backward_inner: 390500.92 | backward_allreduce_microstep: 8.46 | backward_allreduce: 2.89 | reduce_tied_grads: 0.37 | comms: 18.51 | reduce_grads: 0.24 | step: 370.01 | _step_clipping: 0.18 | _step_step: 367.95 | _step_zero_grad: 0.58 | _step_check_overflow: 0.63 samples/sec: 16.279 | iteration 11950/ 143000 | elapsed time per iteration (ms): 62903.0 | learning rate: 5.920E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.481924E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 15:33:08,892] [INFO] [logging.py:60:log_dist] [Rank 0] step=11960, skipped=9, lr=[0.0005920219159652909, 0.0005920219159652909], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11960 loss: 2.4755 iter time (s): 62.633 samples/sec: 16.349 %comms: 0.0028632423160023057 %optimizer_step 0.05616732973551876 %forward: 23.237869271040225 %backward: 62.331273507518034 [2025-04-02 15:33:08,893] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19922.31 | forward: 145545.93 | backward_microstep: 390407.70 | backward: 390399.97 | backward_inner_microstep: 390383.70 | backward_inner: 390377.62 | backward_allreduce_microstep: 7.74 | backward_allreduce: 2.65 | reduce_tied_grads: 0.29 | comms: 17.93 | reduce_grads: 0.19 | step: 351.79 | _step_clipping: 0.13 | _step_step: 350.00 | _step_zero_grad: 0.49 | _step_check_overflow: 0.61 samples/sec: 16.349 | iteration 11960/ 143000 | elapsed time per iteration (ms): 62633.6 | learning rate: 5.920E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.480604E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 15:43:40,588] [INFO] [logging.py:60:log_dist] [Rank 0] step=11970, skipped=9, lr=[0.0005920068104799005, 0.0005920068104799005], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11970 loss: 2.4740 iter time (s): 63.169 samples/sec: 16.210 %comms: 0.002852425951547869 %optimizer_step 0.05636729592808622 %forward: 23.041787306466585 %backward: 61.79960537242726 [2025-04-02 15:43:40,589] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25288.85 | forward: 145552.62 | backward_microstep: 390390.95 | backward: 390381.81 | backward_inner_microstep: 390364.70 | backward_inner: 390358.30 | backward_allreduce_microstep: 8.18 | backward_allreduce: 2.81 | reduce_tied_grads: 0.33 | comms: 18.02 | reduce_grads: 0.20 | step: 356.07 | _step_clipping: 0.13 | _step_step: 354.32 | _step_zero_grad: 0.50 | _step_check_overflow: 0.53 samples/sec: 16.210 | iteration 11970/ 143000 | elapsed time per iteration (ms): 63169.6 | learning rate: 5.920E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.485064E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 15:54:09,536] [INFO] [logging.py:60:log_dist] [Rank 0] step=11980, skipped=9, lr=[0.0005919916909009521, 0.0005919916909009521], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11980 loss: 2.4766 iter time (s): 62.894 samples/sec: 16.281 %comms: 0.002874896514064871 %optimizer_step 0.05640243015128567 %forward: 23.14483416769643 %backward: 62.07782519126147 [2025-04-02 15:54:09,536] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22430.59 | forward: 145567.54 | backward_microstep: 390444.35 | backward: 390433.41 | backward_inner_microstep: 390416.34 | backward_inner: 390409.71 | backward_allreduce_microstep: 8.16 | backward_allreduce: 2.81 | reduce_tied_grads: 0.31 | comms: 18.08 | reduce_grads: 0.22 | step: 354.74 | _step_clipping: 0.13 | _step_step: 352.94 | _step_zero_grad: 0.53 | _step_check_overflow: 0.52 samples/sec: 16.281 | iteration 11980/ 143000 | elapsed time per iteration (ms): 62894.7 | learning rate: 5.920E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.475356E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 16:04:35,716] [INFO] [logging.py:60:log_dist] [Rank 0] step=11990, skipped=9, lr=[0.0005919765572291752, 0.0005919765572291752], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 11990 loss: 2.4725 iter time (s): 62.618 samples/sec: 16.353 %comms: 0.002882839735718856 %optimizer_step 0.056595214891998466 %forward: 23.245294385045785 %backward: 62.33854833853118 [2025-04-02 16:04:35,717] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19783.03 | forward: 145556.24 | backward_microstep: 390356.90 | backward: 390348.46 | backward_inner_microstep: 390332.21 | backward_inner: 390325.85 | backward_allreduce_microstep: 7.71 | backward_allreduce: 2.65 | reduce_tied_grads: 0.30 | comms: 18.05 | reduce_grads: 0.19 | step: 354.39 | _step_clipping: 0.14 | _step_step: 352.62 | _step_zero_grad: 0.49 | _step_check_overflow: 0.56 samples/sec: 16.353 | iteration 11990/ 143000 | elapsed time per iteration (ms): 62618.1 | learning rate: 5.920E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.472078E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 16:15:03,888] [INFO] [logging.py:60:log_dist] [Rank 0] step=12000, skipped=9, lr=[0.0005919614094653005, 0.0005919614094653005], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12000 loss: 2.4634 iter time (s): 62.817 samples/sec: 16.301 %comms: 0.0028773423502418344 %optimizer_step 0.055494866569228266 %forward: 23.178057297208614 %backward: 62.17854852000957 [2025-04-02 16:15:03,889] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21446.92 | forward: 145596.89 | backward_microstep: 390596.29 | backward: 390585.07 | backward_inner_microstep: 390567.21 | backward_inner: 390560.85 | backward_allreduce_microstep: 9.32 | backward_allreduce: 2.61 | reduce_tied_grads: 0.30 | comms: 18.07 | reduce_grads: 0.18 | step: 348.60 | _step_clipping: 0.11 | _step_step: 346.92 | _step_zero_grad: 0.47 | _step_check_overflow: 0.54 samples/sec: 16.301 | iteration 12000/ 143000 | elapsed time per iteration (ms): 62817.2 | learning rate: 5.920E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.465308E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 16:15:06,687] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: ../checkpoints/mydata_left-pythia160m/global_step12000/mp_rank_00_model_states.pt [2025-04-02 16:15:20,551] [INFO] [engine.py:1805:_copy_recovery_script] creating recovery script ../checkpoints/mydata_left-pythia160m/zero_to_fp32.py [2025-04-02 16:15:20,555] [INFO] [engine.py:1818:_save_zero_checkpoint] zero checkpoint saved ../checkpoints/mydata_left-pythia160m/global_step12000/zero_pp_rank_0_mp_rank_00_optim_states.pt [2025-04-02 16:25:48,908] [INFO] [logging.py:60:log_dist] [Rank 0] step=12010, skipped=9, lr=[0.000591946247610059, 0.000591946247610059], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12010 loss: 2.4720 iter time (s): 62.834 samples/sec: 16.297 %comms: 0.002891191916740406 %optimizer_step 0.05597637595838187 %forward: 23.17059913048158 %backward: 62.105505588216445 [2025-04-02 16:25:48,908] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22043.35 | forward: 145590.36 | backward_microstep: 390242.84 | backward: 390234.31 | backward_inner_microstep: 390216.41 | backward_inner: 390210.25 | backward_allreduce_microstep: 7.78 | backward_allreduce: 2.66 | reduce_tied_grads: 0.31 | comms: 18.17 | reduce_grads: 0.19 | step: 351.72 | _step_clipping: 0.13 | _step_step: 349.88 | _step_zero_grad: 0.52 | _step_check_overflow: 0.60 samples/sec: 15.876 | iteration 12010/ 143000 | elapsed time per iteration (ms): 64501.9 | learning rate: 5.919E-04 | approx flops per GPU: 68.5TFLOPS | lm_loss: 2.472754E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 16:36:21,596] [INFO] [logging.py:60:log_dist] [Rank 0] step=12020, skipped=9, lr=[0.0005919310716641823, 0.0005919310716641823], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12020 loss: 2.4766 iter time (s): 63.268 samples/sec: 16.185 %comms: 0.0028648706649872487 %optimizer_step 0.05627527881698803 %forward: 22.998838286353447 %backward: 61.69464074345501 [2025-04-02 16:36:21,596] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26378.34 | forward: 145509.61 | backward_microstep: 390340.47 | backward: 390331.17 | backward_inner_microstep: 390314.00 | backward_inner: 390307.54 | backward_allreduce_microstep: 8.23 | backward_allreduce: 2.84 | reduce_tied_grads: 0.30 | comms: 18.13 | reduce_grads: 0.21 | step: 356.04 | _step_clipping: 0.15 | _step_step: 354.19 | _step_zero_grad: 0.52 | _step_check_overflow: 0.58 samples/sec: 16.185 | iteration 12020/ 143000 | elapsed time per iteration (ms): 63268.8 | learning rate: 5.919E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.480699E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 16:46:50,814] [INFO] [logging.py:60:log_dist] [Rank 0] step=12030, skipped=9, lr=[0.0005919158816284031, 0.0005919158816284031], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12030 loss: 2.4746 iter time (s): 62.921 samples/sec: 16.274 %comms: 0.002868200530318702 %optimizer_step 0.05612840520188018 %forward: 23.109247217571717 %backward: 62.0321645035678 [2025-04-02 16:46:50,815] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23010.46 | forward: 145406.42 | backward_microstep: 390323.06 | backward: 390314.52 | backward_inner_microstep: 390297.50 | backward_inner: 390291.26 | backward_allreduce_microstep: 8.21 | backward_allreduce: 2.82 | reduce_tied_grads: 0.31 | comms: 18.05 | reduce_grads: 0.20 | step: 353.17 | _step_clipping: 0.14 | _step_step: 351.19 | _step_zero_grad: 0.52 | _step_check_overflow: 0.72 samples/sec: 16.274 | iteration 12030/ 143000 | elapsed time per iteration (ms): 62921.9 | learning rate: 5.919E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.471707E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 16:57:13,663] [INFO] [logging.py:60:log_dist] [Rank 0] step=12040, skipped=9, lr=[0.0005919006775034546, 0.0005919006775034546], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12040 loss: 2.4800 iter time (s): 62.284 samples/sec: 16.441 %comms: 0.0028764039971154254 %optimizer_step 0.05566599468566877 %forward: 23.344004392911955 %backward: 62.65603628012882 [2025-04-02 16:57:13,663] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16704.65 | forward: 145396.55 | backward_microstep: 390256.59 | backward: 390248.87 | backward_inner_microstep: 390232.72 | backward_inner: 390226.66 | backward_allreduce_microstep: 7.87 | backward_allreduce: 2.70 | reduce_tied_grads: 0.29 | comms: 17.92 | reduce_grads: 0.20 | step: 346.71 | _step_clipping: 0.13 | _step_step: 344.90 | _step_zero_grad: 0.61 | _step_check_overflow: 0.52 samples/sec: 16.441 | iteration 12040/ 143000 | elapsed time per iteration (ms): 62284.9 | learning rate: 5.919E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.463164E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 17:07:45,379] [INFO] [logging.py:60:log_dist] [Rank 0] step=12050, skipped=9, lr=[0.0005918854592900703, 0.0005918854592900703], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12050 loss: 2.4645 iter time (s): 63.171 samples/sec: 16.210 %comms: 0.002852258707758107 %optimizer_step 0.05707521660327689 %forward: 23.067616653461663 %backward: 61.79392716503897 [2025-04-02 17:07:45,380] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25078.80 | forward: 145720.47 | backward_microstep: 390368.95 | backward: 390358.50 | backward_inner_microstep: 390341.84 | backward_inner: 390335.42 | backward_allreduce_microstep: 7.93 | backward_allreduce: 2.72 | reduce_tied_grads: 0.32 | comms: 18.02 | reduce_grads: 0.20 | step: 360.55 | _step_clipping: 0.13 | _step_step: 358.72 | _step_zero_grad: 0.54 | _step_check_overflow: 0.57 samples/sec: 16.210 | iteration 12050/ 143000 | elapsed time per iteration (ms): 63171.6 | learning rate: 5.919E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.477719E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 17:18:13,498] [INFO] [logging.py:60:log_dist] [Rank 0] step=12060, skipped=9, lr=[0.0005918702269889849, 0.0005918702269889849], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12060 loss: 2.4902 iter time (s): 62.811 samples/sec: 16.303 %comms: 0.002852571332551484 %optimizer_step 0.056258714648408224 %forward: 23.156284193522062 %backward: 62.15339800253103 [2025-04-02 17:18:13,499] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21715.75 | forward: 145447.82 | backward_microstep: 390403.84 | backward: 390394.08 | backward_inner_microstep: 390377.65 | backward_inner: 390371.29 | backward_allreduce_microstep: 7.68 | backward_allreduce: 2.66 | reduce_tied_grads: 0.30 | comms: 17.92 | reduce_grads: 0.19 | step: 353.37 | _step_clipping: 0.12 | _step_step: 351.64 | _step_zero_grad: 0.48 | _step_check_overflow: 0.56 samples/sec: 16.303 | iteration 12060/ 143000 | elapsed time per iteration (ms): 62811.9 | learning rate: 5.919E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.468250E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 17:28:45,500] [INFO] [logging.py:60:log_dist] [Rank 0] step=12070, skipped=9, lr=[0.0005918549806009336, 0.0005918549806009336], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12070 loss: 2.4613 iter time (s): 63.200 samples/sec: 16.203 %comms: 0.0028491920870362896 %optimizer_step 0.05874237640610191 %forward: 23.025822885123702 %backward: 61.758145882654794 [2025-04-02 17:28:45,501] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25584.82 | forward: 145522.45 | backward_microstep: 390319.56 | backward: 390309.48 | backward_inner_microstep: 390292.95 | backward_inner: 390286.56 | backward_allreduce_microstep: 7.85 | backward_allreduce: 2.72 | reduce_tied_grads: 0.30 | comms: 18.01 | reduce_grads: 0.19 | step: 371.25 | _step_clipping: 0.14 | _step_step: 369.46 | _step_zero_grad: 0.52 | _step_check_overflow: 0.55 samples/sec: 16.202 | iteration 12070/ 143000 | elapsed time per iteration (ms): 63200.3 | learning rate: 5.919E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.459520E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 17:39:12,907] [INFO] [logging.py:60:log_dist] [Rank 0] step=12080, skipped=9, lr=[0.0005918397201266522, 0.0005918397201266522], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12080 loss: 2.4578 iter time (s): 62.740 samples/sec: 16.321 %comms: 0.0028837078773572616 %optimizer_step 0.05759173332119824 %forward: 23.191995618569834 %backward: 62.2000918204357 [2025-04-02 17:39:12,908] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21068.92 | forward: 145506.67 | backward_microstep: 390254.18 | backward: 390243.61 | backward_inner_microstep: 390226.01 | backward_inner: 390219.39 | backward_allreduce_microstep: 8.31 | backward_allreduce: 2.86 | reduce_tied_grads: 0.34 | comms: 18.09 | reduce_grads: 0.22 | step: 361.33 | _step_clipping: 0.12 | _step_step: 359.50 | _step_zero_grad: 0.52 | _step_check_overflow: 0.59 samples/sec: 16.321 | iteration 12080/ 143000 | elapsed time per iteration (ms): 62740.6 | learning rate: 5.918E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.461743E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 17:49:35,481] [INFO] [logging.py:60:log_dist] [Rank 0] step=12090, skipped=9, lr=[0.0005918244455668774, 0.0005918244455668774], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12090 loss: 2.4464 iter time (s): 62.257 samples/sec: 16.448 %comms: 0.0029024901359806575 %optimizer_step 0.058816756183128406 %forward: 23.340154437268048 %backward: 62.69213059802754 [2025-04-02 17:49:35,481] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16375.79 | forward: 145308.39 | backward_microstep: 390312.43 | backward: 390301.31 | backward_inner_microstep: 390284.99 | backward_inner: 390278.68 | backward_allreduce_microstep: 7.69 | backward_allreduce: 2.64 | reduce_tied_grads: 0.29 | comms: 18.07 | reduce_grads: 0.18 | step: 366.17 | _step_clipping: 0.12 | _step_step: 364.37 | _step_zero_grad: 0.54 | _step_check_overflow: 0.56 samples/sec: 16.448 | iteration 12090/ 143000 | elapsed time per iteration (ms): 62257.4 | learning rate: 5.918E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.459713E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 18:00:01,934] [INFO] [logging.py:60:log_dist] [Rank 0] step=12100, skipped=9, lr=[0.0005918091569223462, 0.0005918091569223462], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12100 loss: 2.4489 iter time (s): 62.645 samples/sec: 16.346 %comms: 0.002873821447479947 %optimizer_step 0.058014428419623794 %forward: 23.20513388262577 %backward: 62.281297649332565 [2025-04-02 18:00:01,934] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20396.95 | forward: 145368.02 | backward_microstep: 390168.48 | backward: 390159.73 | backward_inner_microstep: 390142.95 | backward_inner: 390136.67 | backward_allreduce_microstep: 8.07 | backward_allreduce: 2.75 | reduce_tied_grads: 0.29 | comms: 18.00 | reduce_grads: 0.20 | step: 363.43 | _step_clipping: 0.12 | _step_step: 361.55 | _step_zero_grad: 0.52 | _step_check_overflow: 0.65 samples/sec: 16.346 | iteration 12100/ 143000 | elapsed time per iteration (ms): 62645.3 | learning rate: 5.918E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.464422E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 18:10:24,405] [INFO] [logging.py:60:log_dist] [Rank 0] step=12110, skipped=9, lr=[0.0005917938541937965, 0.0005917938541937965], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12110 loss: 2.4564 iter time (s): 62.247 samples/sec: 16.451 %comms: 0.0028892197947105076 %optimizer_step 0.057195842609589286 %forward: 23.35352748156737 %backward: 62.68296272479126 [2025-04-02 18:10:24,405] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16386.12 | forward: 145367.60 | backward_microstep: 390187.46 | backward: 390179.68 | backward_inner_microstep: 390163.03 | backward_inner: 390156.84 | backward_allreduce_microstep: 8.19 | backward_allreduce: 2.74 | reduce_tied_grads: 0.30 | comms: 17.98 | reduce_grads: 0.20 | step: 356.02 | _step_clipping: 0.12 | _step_step: 354.28 | _step_zero_grad: 0.50 | _step_check_overflow: 0.56 samples/sec: 16.451 | iteration 12110/ 143000 | elapsed time per iteration (ms): 62247.1 | learning rate: 5.918E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.467186E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 18:20:46,090] [INFO] [logging.py:60:log_dist] [Rank 0] step=12120, skipped=9, lr=[0.000591778537381967, 0.000591778537381967], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12120 loss: 2.4797 iter time (s): 62.168 samples/sec: 16.472 %comms: 0.0028753065493500483 %optimizer_step 0.0565612077060044 %forward: 23.365896105220827 %backward: 62.75072484041304 [2025-04-02 18:20:46,090] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15801.91 | forward: 145261.01 | backward_microstep: 390116.63 | backward: 390108.46 | backward_inner_microstep: 390092.21 | backward_inner: 390086.07 | backward_allreduce_microstep: 7.85 | backward_allreduce: 2.69 | reduce_tied_grads: 0.27 | comms: 17.88 | reduce_grads: 0.19 | step: 351.63 | _step_clipping: 0.11 | _step_step: 349.93 | _step_zero_grad: 0.48 | _step_check_overflow: 0.56 samples/sec: 16.471 | iteration 12120/ 143000 | elapsed time per iteration (ms): 62168.5 | learning rate: 5.918E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.464142E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 18:31:12,137] [INFO] [logging.py:60:log_dist] [Rank 0] step=12130, skipped=9, lr=[0.0005917632064875968, 0.0005917632064875968], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12130 loss: 2.4678 iter time (s): 62.604 samples/sec: 16.357 %comms: 0.002873781992859792 %optimizer_step 0.05665604454486698 %forward: 23.206378401558187 %backward: 62.32241584899864 [2025-04-02 18:31:12,137] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20056.47 | forward: 145281.54 | backward_microstep: 390173.81 | backward: 390164.15 | backward_inner_microstep: 390147.23 | backward_inner: 390140.91 | backward_allreduce_microstep: 8.03 | backward_allreduce: 2.74 | reduce_tied_grads: 0.31 | comms: 17.99 | reduce_grads: 0.19 | step: 354.69 | _step_clipping: 0.13 | _step_step: 352.95 | _step_zero_grad: 0.48 | _step_check_overflow: 0.56 samples/sec: 16.357 | iteration 12130/ 143000 | elapsed time per iteration (ms): 62604.7 | learning rate: 5.918E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.471340E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 18:41:44,817] [INFO] [logging.py:60:log_dist] [Rank 0] step=12140, skipped=9, lr=[0.0005917478615114261, 0.0005917478615114261], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12140 loss: 2.4820 iter time (s): 63.267 samples/sec: 16.185 %comms: 0.0028406740987581 %optimizer_step 0.055798193751753704 %forward: 22.986882830762063 %backward: 61.68432006696172 [2025-04-02 18:41:44,818] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26441.69 | forward: 145432.23 | backward_microstep: 390269.27 | backward: 390261.19 | backward_inner_microstep: 390244.59 | backward_inner: 390238.38 | backward_allreduce_microstep: 8.03 | backward_allreduce: 2.76 | reduce_tied_grads: 0.31 | comms: 17.97 | reduce_grads: 0.22 | step: 353.02 | _step_clipping: 0.14 | _step_step: 351.21 | _step_zero_grad: 0.50 | _step_check_overflow: 0.58 samples/sec: 16.185 | iteration 12140/ 143000 | elapsed time per iteration (ms): 63268.1 | learning rate: 5.917E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.472585E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 18:52:12,325] [INFO] [logging.py:60:log_dist] [Rank 0] step=12150, skipped=9, lr=[0.0005917325024541953, 0.0005917325024541953], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12150 loss: 2.4826 iter time (s): 62.750 samples/sec: 16.319 %comms: 0.002853413116729677 %optimizer_step 0.057416751084349134 %forward: 23.189414646779184 %backward: 62.21034993478429 [2025-04-02 18:52:12,326] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21009.35 | forward: 145514.13 | backward_microstep: 390379.20 | backward: 390371.43 | backward_inner_microstep: 390355.29 | backward_inner: 390349.18 | backward_allreduce_microstep: 7.80 | backward_allreduce: 2.68 | reduce_tied_grads: 0.27 | comms: 17.91 | reduce_grads: 0.18 | step: 360.29 | _step_clipping: 0.11 | _step_step: 358.68 | _step_zero_grad: 0.49 | _step_check_overflow: 0.47 samples/sec: 16.319 | iteration 12150/ 143000 | elapsed time per iteration (ms): 62750.8 | learning rate: 5.917E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.467055E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 19:02:41,642] [INFO] [logging.py:60:log_dist] [Rank 0] step=12160, skipped=9, lr=[0.0005917171293166458, 0.0005917171293166458], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12160 loss: 2.4536 iter time (s): 62.931 samples/sec: 16.272 %comms: 0.0028748439896949408 %optimizer_step 0.05751627728108582 %forward: 23.14990380560684 %backward: 62.04624528021393 [2025-04-02 19:02:41,643] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22569.25 | forward: 145684.64 | backward_microstep: 390473.02 | backward: 390463.18 | backward_inner_microstep: 390446.69 | backward_inner: 390440.38 | backward_allreduce_microstep: 7.89 | backward_allreduce: 2.71 | reduce_tied_grads: 0.34 | comms: 18.09 | reduce_grads: 0.21 | step: 361.96 | _step_clipping: 0.13 | _step_step: 360.07 | _step_zero_grad: 0.52 | _step_check_overflow: 0.62 samples/sec: 16.272 | iteration 12160/ 143000 | elapsed time per iteration (ms): 62931.7 | learning rate: 5.917E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.461823E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 19:13:09,539] [INFO] [logging.py:60:log_dist] [Rank 0] step=12170, skipped=9, lr=[0.0005917017420995195, 0.0005917017420995195], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12170 loss: 2.4660 iter time (s): 62.789 samples/sec: 16.309 %comms: 0.0028723778413313576 %optimizer_step 0.05858278513112817 %forward: 23.19656196990815 %backward: 62.18954021471295 [2025-04-02 19:13:09,540] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21127.97 | forward: 145649.20 | backward_microstep: 390493.27 | backward: 390482.74 | backward_inner_microstep: 390464.05 | backward_inner: 390457.67 | backward_allreduce_microstep: 10.17 | backward_allreduce: 2.92 | reduce_tied_grads: 0.30 | comms: 18.04 | reduce_grads: 0.19 | step: 367.84 | _step_clipping: 0.11 | _step_step: 366.07 | _step_zero_grad: 0.50 | _step_check_overflow: 0.58 samples/sec: 16.308 | iteration 12170/ 143000 | elapsed time per iteration (ms): 62789.8 | learning rate: 5.917E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.461169E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 19:23:38,692] [INFO] [logging.py:60:log_dist] [Rank 0] step=12180, skipped=9, lr=[0.0005916863408035591, 0.0005916863408035591], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12180 loss: 2.4564 iter time (s): 62.915 samples/sec: 16.276 %comms: 0.002841107391371549 %optimizer_step 0.055504329047954035 %forward: 23.112073972944387 %backward: 62.05102116450877 [2025-04-02 19:23:38,692] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22756.85 | forward: 145408.72 | backward_microstep: 390399.79 | backward: 390391.61 | backward_inner_microstep: 390375.50 | backward_inner: 390369.36 | backward_allreduce_microstep: 7.76 | backward_allreduce: 2.65 | reduce_tied_grads: 0.27 | comms: 17.87 | reduce_grads: 0.19 | step: 349.20 | _step_clipping: 0.13 | _step_step: 347.47 | _step_zero_grad: 0.49 | _step_check_overflow: 0.56 samples/sec: 16.276 | iteration 12180/ 143000 | elapsed time per iteration (ms): 62915.2 | learning rate: 5.917E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.464948E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 19:34:03,456] [INFO] [logging.py:60:log_dist] [Rank 0] step=12190, skipped=9, lr=[0.000591670925429508, 0.000591670925429508], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12190 loss: 2.4751 iter time (s): 62.476 samples/sec: 16.390 %comms: 0.0028669336239327205 %optimizer_step 0.05567982284339191 %forward: 23.266842151630794 %backward: 62.47309598445536 [2025-04-02 19:34:03,457] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18537.97 | forward: 145361.76 | backward_microstep: 390314.19 | backward: 390306.47 | backward_inner_microstep: 390290.45 | backward_inner: 390284.54 | backward_allreduce_microstep: 7.75 | backward_allreduce: 2.59 | reduce_tied_grads: 0.28 | comms: 17.91 | reduce_grads: 0.19 | step: 347.86 | _step_clipping: 0.12 | _step_step: 346.16 | _step_zero_grad: 0.49 | _step_check_overflow: 0.54 samples/sec: 16.390 | iteration 12190/ 143000 | elapsed time per iteration (ms): 62476.5 | learning rate: 5.917E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.465791E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 19:44:27,327] [INFO] [logging.py:60:log_dist] [Rank 0] step=12200, skipped=9, lr=[0.0005916554959781101, 0.0005916554959781101], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12200 loss: 2.4549 iter time (s): 62.387 samples/sec: 16.414 %comms: 0.0028712721295831908 %optimizer_step 0.056467472905348245 %forward: 23.287631383152043 %backward: 62.553298361514976 [2025-04-02 19:44:27,327] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17752.92 | forward: 145283.40 | backward_microstep: 390255.11 | backward: 390248.20 | backward_inner_microstep: 390232.59 | backward_inner: 390226.80 | backward_allreduce_microstep: 7.51 | backward_allreduce: 2.59 | reduce_tied_grads: 0.27 | comms: 17.91 | reduce_grads: 0.19 | step: 352.28 | _step_clipping: 0.14 | _step_step: 350.60 | _step_zero_grad: 0.48 | _step_check_overflow: 0.51 samples/sec: 16.414 | iteration 12200/ 143000 | elapsed time per iteration (ms): 62387.0 | learning rate: 5.917E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.466428E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 19:54:51,491] [INFO] [logging.py:60:log_dist] [Rank 0] step=12210, skipped=9, lr=[0.0005916400524501101, 0.0005916400524501101], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12210 loss: 2.4595 iter time (s): 62.416 samples/sec: 16.406 %comms: 0.002876262959309248 %optimizer_step 0.05648316869516374 %forward: 23.287662477071358 %backward: 62.543041290970294 [2025-04-02 19:54:51,491] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17828.62 | forward: 145351.94 | backward_microstep: 390375.40 | backward: 390367.76 | backward_inner_microstep: 390352.18 | backward_inner: 390346.26 | backward_allreduce_microstep: 7.45 | backward_allreduce: 2.57 | reduce_tied_grads: 0.30 | comms: 17.95 | reduce_grads: 0.19 | step: 352.54 | _step_clipping: 0.11 | _step_step: 350.80 | _step_zero_grad: 0.48 | _step_check_overflow: 0.60 samples/sec: 16.406 | iteration 12210/ 143000 | elapsed time per iteration (ms): 62416.4 | learning rate: 5.916E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.461825E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 20:05:21,059] [INFO] [logging.py:60:log_dist] [Rank 0] step=12220, skipped=9, lr=[0.0005916245948462534, 0.0005916245948462534], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12220 loss: 2.4735 iter time (s): 62.956 samples/sec: 16.265 %comms: 0.0028835745412149703 %optimizer_step 0.056806641897967025 %forward: 23.10716843277803 %backward: 61.9951207796499 [2025-04-02 20:05:21,060] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23131.62 | forward: 145474.08 | backward_microstep: 390305.71 | backward: 390298.07 | backward_inner_microstep: 390282.28 | backward_inner: 390276.26 | backward_allreduce_microstep: 7.60 | backward_allreduce: 2.61 | reduce_tied_grads: 0.32 | comms: 18.15 | reduce_grads: 0.20 | step: 357.63 | _step_clipping: 0.13 | _step_step: 355.88 | _step_zero_grad: 0.49 | _step_check_overflow: 0.54 samples/sec: 16.265 | iteration 12220/ 143000 | elapsed time per iteration (ms): 62956.8 | learning rate: 5.916E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.467317E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 20:15:44,715] [INFO] [logging.py:60:log_dist] [Rank 0] step=12230, skipped=9, lr=[0.0005916091231672862, 0.0005916091231672862], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12230 loss: 2.4626 iter time (s): 62.365 samples/sec: 16.419 %comms: 0.0028812467434155024 %optimizer_step 0.05625589633515293 %forward: 23.308246960284112 %backward: 62.59205993739467 [2025-04-02 20:15:44,715] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17339.01 | forward: 145361.86 | backward_microstep: 390363.69 | backward: 390355.33 | backward_inner_microstep: 390339.73 | backward_inner: 390333.57 | backward_allreduce_microstep: 7.39 | backward_allreduce: 2.54 | reduce_tied_grads: 0.30 | comms: 17.97 | reduce_grads: 0.19 | step: 350.84 | _step_clipping: 0.13 | _step_step: 349.08 | _step_zero_grad: 0.51 | _step_check_overflow: 0.55 samples/sec: 16.419 | iteration 12230/ 143000 | elapsed time per iteration (ms): 62365.6 | learning rate: 5.916E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.463997E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 20:26:17,641] [INFO] [logging.py:60:log_dist] [Rank 0] step=12240, skipped=9, lr=[0.000591593637413955, 0.000591593637413955], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12240 loss: 2.4768 iter time (s): 63.292 samples/sec: 16.179 %comms: 0.002840854616567664 %optimizer_step 0.05566500458232317 %forward: 22.981930985956126 %backward: 61.67991270553074 [2025-04-02 20:26:17,641] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26459.78 | forward: 145457.24 | backward_microstep: 390391.50 | backward: 390384.51 | backward_inner_microstep: 390368.90 | backward_inner: 390363.00 | backward_allreduce_microstep: 7.42 | backward_allreduce: 2.56 | reduce_tied_grads: 0.31 | comms: 17.98 | reduce_grads: 0.18 | step: 352.31 | _step_clipping: 0.13 | _step_step: 350.69 | _step_zero_grad: 0.47 | _step_check_overflow: 0.48 samples/sec: 16.179 | iteration 12240/ 143000 | elapsed time per iteration (ms): 63292.6 | learning rate: 5.916E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.463112E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 20:36:38,361] [INFO] [logging.py:60:log_dist] [Rank 0] step=12250, skipped=9, lr=[0.0005915781375870073, 0.0005915781375870073], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12250 loss: 2.4632 iter time (s): 62.072 samples/sec: 16.497 %comms: 0.002896213232042573 %optimizer_step 0.05581934194212776 %forward: 23.41865539924053 %backward: 62.90934137998537 [2025-04-02 20:36:38,362] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14250.27 | forward: 145363.17 | backward_microstep: 390500.27 | backward: 390487.89 | backward_inner_microstep: 390469.98 | backward_inner: 390463.84 | backward_allreduce_microstep: 7.82 | backward_allreduce: 2.82 | reduce_tied_grads: 0.29 | comms: 17.98 | reduce_grads: 0.19 | step: 346.48 | _step_clipping: 0.13 | _step_step: 344.83 | _step_zero_grad: 0.45 | _step_check_overflow: 0.50 samples/sec: 16.497 | iteration 12250/ 143000 | elapsed time per iteration (ms): 62072.1 | learning rate: 5.916E-04 | approx flops per GPU: 71.2TFLOPS | lm_loss: 2.469280E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 20:47:03,637] [INFO] [logging.py:60:log_dist] [Rank 0] step=12260, skipped=9, lr=[0.0005915626236871912, 0.0005915626236871912], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12260 loss: 2.4773 iter time (s): 62.527 samples/sec: 16.377 %comms: 0.002857385176461704 %optimizer_step 0.05542473881776179 %forward: 23.269751076383464 %backward: 62.43316252172736 [2025-04-02 20:47:03,637] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18766.37 | forward: 145498.78 | backward_microstep: 390384.28 | backward: 390375.85 | backward_inner_microstep: 390358.20 | backward_inner: 390352.17 | backward_allreduce_microstep: 7.61 | backward_allreduce: 2.61 | reduce_tied_grads: 0.27 | comms: 17.87 | reduce_grads: 0.19 | step: 346.55 | _step_clipping: 0.12 | _step_step: 344.93 | _step_zero_grad: 0.46 | _step_check_overflow: 0.50 samples/sec: 16.377 | iteration 12260/ 143000 | elapsed time per iteration (ms): 62527.5 | learning rate: 5.916E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.459092E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 20:57:29,650] [INFO] [logging.py:60:log_dist] [Rank 0] step=12270, skipped=9, lr=[0.0005915470957152555, 0.0005915470957152555], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12270 loss: 2.4591 iter time (s): 62.601 samples/sec: 16.358 %comms: 0.0028691763671062484 %optimizer_step 0.056215557186818046 %forward: 23.224908107385826 %backward: 62.346846128631924 [2025-04-02 20:57:29,650] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19687.52 | forward: 145389.70 | backward_microstep: 390302.97 | backward: 390296.03 | backward_inner_microstep: 390280.26 | backward_inner: 390274.28 | backward_allreduce_microstep: 7.64 | backward_allreduce: 2.62 | reduce_tied_grads: 0.31 | comms: 17.96 | reduce_grads: 0.19 | step: 351.91 | _step_clipping: 0.13 | _step_step: 350.21 | _step_zero_grad: 0.46 | _step_check_overflow: 0.57 samples/sec: 16.357 | iteration 12270/ 143000 | elapsed time per iteration (ms): 62601.3 | learning rate: 5.915E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.464231E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 21:08:01,907] [INFO] [logging.py:60:log_dist] [Rank 0] step=12280, skipped=9, lr=[0.0005915315536719496, 0.0005915315536719496], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12280 loss: 2.4529 iter time (s): 63.225 samples/sec: 16.196 %comms: 0.0028779832201345328 %optimizer_step 0.05498618477928813 %forward: 22.99093762829712 %backward: 61.72442208948687 [2025-04-02 21:08:01,908] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26009.54 | forward: 145360.66 | backward_microstep: 390260.51 | backward: 390253.88 | backward_inner_microstep: 390238.20 | backward_inner: 390232.41 | backward_allreduce_microstep: 7.69 | backward_allreduce: 2.78 | reduce_tied_grads: 0.26 | comms: 18.20 | reduce_grads: 0.18 | step: 347.65 | _step_clipping: 0.10 | _step_step: 346.06 | _step_zero_grad: 0.45 | _step_check_overflow: 0.49 samples/sec: 16.196 | iteration 12280/ 143000 | elapsed time per iteration (ms): 63225.8 | learning rate: 5.915E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.457904E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 21:18:21,008] [INFO] [logging.py:60:log_dist] [Rank 0] step=12290, skipped=9, lr=[0.0005915159975580237, 0.0005915159975580237], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12290 loss: 2.4776 iter time (s): 61.910 samples/sec: 16.540 %comms: 0.0028931238786458844 %optimizer_step 0.05716674145322895 %forward: 23.45809766164446 %backward: 63.025803660384874 [2025-04-02 21:18:21,009] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13076.40 | forward: 145227.99 | backward_microstep: 390196.16 | backward: 390189.83 | backward_inner_microstep: 390174.58 | backward_inner: 390168.96 | backward_allreduce_microstep: 7.43 | backward_allreduce: 2.56 | reduce_tied_grads: 0.28 | comms: 17.91 | reduce_grads: 0.18 | step: 353.92 | _step_clipping: 0.11 | _step_step: 352.25 | _step_zero_grad: 0.50 | _step_check_overflow: 0.52 samples/sec: 16.540 | iteration 12290/ 143000 | elapsed time per iteration (ms): 61910.1 | learning rate: 5.915E-04 | approx flops per GPU: 71.4TFLOPS | lm_loss: 2.459358E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 21:28:45,415] [INFO] [logging.py:60:log_dist] [Rank 0] step=12300, skipped=9, lr=[0.0005915004273742286, 0.0005915004273742286], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12300 loss: 2.4619 iter time (s): 62.440 samples/sec: 16.400 %comms: 0.0028675458804648937 %optimizer_step 0.05584060575753009 %forward: 23.27774945844415 %backward: 62.49641770592782 [2025-04-02 21:28:45,415] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18194.19 | forward: 145346.60 | backward_microstep: 390235.17 | backward: 390228.51 | backward_inner_microstep: 390213.52 | backward_inner: 390207.78 | backward_allreduce_microstep: 7.24 | backward_allreduce: 2.48 | reduce_tied_grads: 0.29 | comms: 17.90 | reduce_grads: 0.18 | step: 348.67 | _step_clipping: 0.12 | _step_step: 347.06 | _step_zero_grad: 0.45 | _step_check_overflow: 0.50 samples/sec: 16.400 | iteration 12300/ 143000 | elapsed time per iteration (ms): 62440.7 | learning rate: 5.915E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.457238E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 21:39:06,051] [INFO] [logging.py:60:log_dist] [Rank 0] step=12310, skipped=9, lr=[0.0005914848431213157, 0.0005914848431213157], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12310 loss: 2.4531 iter time (s): 62.063 samples/sec: 16.499 %comms: 0.002871173381107108 %optimizer_step 0.05510113628138663 %forward: 23.400334307765554 %backward: 62.87146967206314 [2025-04-02 21:39:06,052] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14588.33 | forward: 145229.83 | backward_microstep: 390207.38 | backward: 390200.11 | backward_inner_microstep: 390183.13 | backward_inner: 390177.41 | backward_allreduce_microstep: 9.13 | backward_allreduce: 4.18 | reduce_tied_grads: 0.24 | comms: 17.82 | reduce_grads: 0.18 | step: 341.97 | _step_clipping: 0.12 | _step_step: 340.21 | _step_zero_grad: 0.45 | _step_check_overflow: 0.48 samples/sec: 16.499 | iteration 12310/ 143000 | elapsed time per iteration (ms): 62063.6 | learning rate: 5.915E-04 | approx flops per GPU: 71.2TFLOPS | lm_loss: 2.461919E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 21:49:30,912] [INFO] [logging.py:60:log_dist] [Rank 0] step=12320, skipped=9, lr=[0.0005914692448000371, 0.0005914692448000371], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12320 loss: 2.4643 iter time (s): 62.485 samples/sec: 16.388 %comms: 0.0028883207657306857 %optimizer_step 0.055969733127280805 %forward: 23.259969238029313 %backward: 62.45940615459619 [2025-04-02 21:49:30,912] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18561.88 | forward: 145341.02 | backward_microstep: 390288.05 | backward: 390280.57 | backward_inner_microstep: 390262.89 | backward_inner: 390256.98 | backward_allreduce_microstep: 9.60 | backward_allreduce: 4.60 | reduce_tied_grads: 0.29 | comms: 18.05 | reduce_grads: 0.19 | step: 349.73 | _step_clipping: 0.11 | _step_step: 348.00 | _step_zero_grad: 0.47 | _step_check_overflow: 0.59 samples/sec: 16.388 | iteration 12320/ 143000 | elapsed time per iteration (ms): 62486.0 | learning rate: 5.915E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.456650E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 22:00:01,167] [INFO] [logging.py:60:log_dist] [Rank 0] step=12330, skipped=9, lr=[0.0005914536324111459, 0.0005914536324111459], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12330 loss: 2.4613 iter time (s): 63.025 samples/sec: 16.248 %comms: 0.002843813339688495 %optimizer_step 0.055682073251937025 %forward: 23.078348328570982 %backward: 61.91752763139169 [2025-04-02 22:00:01,167] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23965.62 | forward: 145451.15 | backward_microstep: 390241.92 | backward: 390234.85 | backward_inner_microstep: 390219.06 | backward_inner: 390213.18 | backward_allreduce_microstep: 7.65 | backward_allreduce: 2.63 | reduce_tied_grads: 0.30 | comms: 17.92 | reduce_grads: 0.19 | step: 350.94 | _step_clipping: 0.12 | _step_step: 349.25 | _step_zero_grad: 0.51 | _step_check_overflow: 0.49 samples/sec: 16.247 | iteration 12330/ 143000 | elapsed time per iteration (ms): 63025.5 | learning rate: 5.915E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.458112E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 22:10:21,266] [INFO] [logging.py:60:log_dist] [Rank 0] step=12340, skipped=9, lr=[0.0005914380059553955, 0.0005914380059553955], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12340 loss: 2.4907 iter time (s): 62.009 samples/sec: 16.514 %comms: 0.002880580407085038 %optimizer_step 0.05529368675326605 %forward: 23.42428399167622 %backward: 62.931530476946726 [2025-04-02 22:10:21,267] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13975.28 | forward: 145252.69 | backward_microstep: 390241.60 | backward: 390234.94 | backward_inner_microstep: 390219.70 | backward_inner: 390213.94 | backward_allreduce_microstep: 7.33 | backward_allreduce: 2.51 | reduce_tied_grads: 0.27 | comms: 17.86 | reduce_grads: 0.18 | step: 342.87 | _step_clipping: 0.11 | _step_step: 341.27 | _step_zero_grad: 0.46 | _step_check_overflow: 0.49 samples/sec: 16.513 | iteration 12340/ 143000 | elapsed time per iteration (ms): 62010.0 | learning rate: 5.914E-04 | approx flops per GPU: 71.2TFLOPS | lm_loss: 2.463677E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 22:20:45,808] [INFO] [logging.py:60:log_dist] [Rank 0] step=12350, skipped=9, lr=[0.00059142236543354, 0.00059142236543354], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12350 loss: 2.4675 iter time (s): 62.454 samples/sec: 16.396 %comms: 0.002886094831862949 %optimizer_step 0.05604214294753527 %forward: 23.26329445822016 %backward: 62.492270488429725 [2025-04-02 22:20:45,808] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18294.54 | forward: 145287.50 | backward_microstep: 390293.49 | backward: 390286.33 | backward_inner_microstep: 390270.30 | backward_inner: 390264.26 | backward_allreduce_microstep: 7.86 | backward_allreduce: 2.83 | reduce_tied_grads: 0.31 | comms: 18.02 | reduce_grads: 0.19 | step: 350.00 | _step_clipping: 0.11 | _step_step: 348.18 | _step_zero_grad: 0.50 | _step_check_overflow: 0.64 samples/sec: 16.396 | iteration 12350/ 143000 | elapsed time per iteration (ms): 62454.1 | learning rate: 5.914E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.464075E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 22:31:10,924] [INFO] [logging.py:60:log_dist] [Rank 0] step=12360, skipped=9, lr=[0.0005914067108463343, 0.0005914067108463343], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12360 loss: 2.4645 iter time (s): 62.511 samples/sec: 16.381 %comms: 0.0028551365240211007 %optimizer_step 0.05658284154310076 %forward: 23.261681541517696 %backward: 62.44260758523734 [2025-04-02 22:31:10,925] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18669.18 | forward: 145411.36 | backward_microstep: 390343.70 | backward: 390335.69 | backward_inner_microstep: 390320.09 | backward_inner: 390312.28 | backward_allreduce_microstep: 7.37 | backward_allreduce: 2.54 | reduce_tied_grads: 0.26 | comms: 17.85 | reduce_grads: 0.18 | step: 353.71 | _step_clipping: 0.13 | _step_step: 352.04 | _step_zero_grad: 0.46 | _step_check_overflow: 0.54 samples/sec: 16.381 | iteration 12360/ 143000 | elapsed time per iteration (ms): 62511.7 | learning rate: 5.914E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.457785E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 22:41:36,072] [INFO] [logging.py:60:log_dist] [Rank 0] step=12370, skipped=9, lr=[0.0005913910421945341, 0.0005913910421945341], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12370 loss: 2.4542 iter time (s): 62.514 samples/sec: 16.380 %comms: 0.002862624479576393 %optimizer_step 0.05711854779677611 %forward: 23.26088573064358 %backward: 62.44752496003534 [2025-04-02 22:41:36,072] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18672.70 | forward: 145413.50 | backward_microstep: 390393.62 | backward: 390385.54 | backward_inner_microstep: 390369.95 | backward_inner: 390364.02 | backward_allreduce_microstep: 7.46 | backward_allreduce: 2.55 | reduce_tied_grads: 0.28 | comms: 17.90 | reduce_grads: 0.18 | step: 357.07 | _step_clipping: 0.11 | _step_step: 355.45 | _step_zero_grad: 0.45 | _step_check_overflow: 0.52 samples/sec: 16.380 | iteration 12370/ 143000 | elapsed time per iteration (ms): 62514.7 | learning rate: 5.914E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.455665E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 22:52:07,068] [INFO] [logging.py:60:log_dist] [Rank 0] step=12380, skipped=9, lr=[0.0005913753594788956, 0.0005913753594788956], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12380 loss: 2.4374 iter time (s): 63.099 samples/sec: 16.228 %comms: 0.002851206639877055 %optimizer_step 0.05881619243941402 %forward: 23.058834576278038 %backward: 61.87785821147788 [2025-04-02 22:52:07,068] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24313.00 | forward: 145498.93 | backward_microstep: 390452.05 | backward: 390443.07 | backward_inner_microstep: 390427.00 | backward_inner: 390420.83 | backward_allreduce_microstep: 7.65 | backward_allreduce: 2.65 | reduce_tied_grads: 0.32 | comms: 17.99 | reduce_grads: 0.21 | step: 371.12 | _step_clipping: 0.13 | _step_step: 369.31 | _step_zero_grad: 0.52 | _step_check_overflow: 0.56 samples/sec: 16.228 | iteration 12380/ 143000 | elapsed time per iteration (ms): 63099.6 | learning rate: 5.914E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.450311E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 23:02:31,607] [INFO] [logging.py:60:log_dist] [Rank 0] step=12390, skipped=9, lr=[0.0005913596627001756, 0.0005913596627001756], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12390 loss: 2.4705 iter time (s): 62.453 samples/sec: 16.396 %comms: 0.002885873267949886 %optimizer_step 0.05825318147208908 %forward: 23.273843106890688 %backward: 62.48971432125323 [2025-04-02 23:02:31,607] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18183.08 | forward: 145353.00 | backward_microstep: 390279.66 | backward: 390269.36 | backward_inner_microstep: 390252.91 | backward_inner: 390245.11 | backward_allreduce_microstep: 8.04 | backward_allreduce: 2.82 | reduce_tied_grads: 0.32 | comms: 18.02 | reduce_grads: 0.20 | step: 363.81 | _step_clipping: 0.13 | _step_step: 362.00 | _step_zero_grad: 0.57 | _step_check_overflow: 0.51 samples/sec: 16.396 | iteration 12390/ 143000 | elapsed time per iteration (ms): 62453.9 | learning rate: 5.914E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.455997E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 23:12:57,317] [INFO] [logging.py:60:log_dist] [Rank 0] step=12400, skipped=9, lr=[0.0005913439518591319, 0.0005913439518591319], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12400 loss: 2.4627 iter time (s): 62.570 samples/sec: 16.366 %comms: 0.002849836720980666 %optimizer_step 0.05597939556047969 %forward: 23.244669990287075 %backward: 62.34670369681044 [2025-04-02 23:12:57,317] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19520.87 | forward: 145443.01 | backward_microstep: 390113.55 | backward: 390106.29 | backward_inner_microstep: 390090.27 | backward_inner: 390084.38 | backward_allreduce_microstep: 7.86 | backward_allreduce: 2.61 | reduce_tied_grads: 0.25 | comms: 17.83 | reduce_grads: 0.18 | step: 350.27 | _step_clipping: 0.11 | _step_step: 348.64 | _step_zero_grad: 0.46 | _step_check_overflow: 0.51 samples/sec: 16.365 | iteration 12400/ 143000 | elapsed time per iteration (ms): 62571.0 | learning rate: 5.913E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.468578E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 23:23:19,460] [INFO] [logging.py:60:log_dist] [Rank 0] step=12410, skipped=9, lr=[0.0005913282269565226, 0.0005913282269565226], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12410 loss: 2.4641 iter time (s): 62.214 samples/sec: 16.459 %comms: 0.0028773282277758418 %optimizer_step 0.05641306232700463 %forward: 23.364052482819442 %backward: 62.69206910334522 [2025-04-02 23:23:19,460] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16108.84 | forward: 145356.58 | backward_microstep: 390038.40 | backward: 390031.00 | backward_inner_microstep: 390013.67 | backward_inner: 390007.84 | backward_allreduce_microstep: 7.54 | backward_allreduce: 2.58 | reduce_tied_grads: 0.28 | comms: 17.90 | reduce_grads: 0.18 | step: 350.97 | _step_clipping: 0.11 | _step_step: 349.18 | _step_zero_grad: 0.45 | _step_check_overflow: 0.50 samples/sec: 16.459 | iteration 12410/ 143000 | elapsed time per iteration (ms): 62214.3 | learning rate: 5.913E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.461837E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 23:33:44,689] [INFO] [logging.py:60:log_dist] [Rank 0] step=12420, skipped=9, lr=[0.0005913124879931067, 0.0005913124879931067], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12420 loss: 2.4724 iter time (s): 62.522 samples/sec: 16.378 %comms: 0.0028723166289562506 %optimizer_step 0.05670250389247656 %forward: 23.231304445397 %backward: 62.384304561564086 [2025-04-02 23:33:44,689] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19276.01 | forward: 145247.61 | backward_microstep: 390050.33 | backward: 390041.42 | backward_inner_microstep: 390025.75 | backward_inner: 390019.95 | backward_allreduce_microstep: 7.58 | backward_allreduce: 2.62 | reduce_tied_grads: 0.28 | comms: 17.96 | reduce_grads: 0.19 | step: 354.52 | _step_clipping: 0.12 | _step_step: 352.77 | _step_zero_grad: 0.47 | _step_check_overflow: 0.60 samples/sec: 16.378 | iteration 12420/ 143000 | elapsed time per iteration (ms): 62522.9 | learning rate: 5.913E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.452214E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 23:44:04,369] [INFO] [logging.py:60:log_dist] [Rank 0] step=12430, skipped=9, lr=[0.0005912967349696439, 0.0005912967349696439], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12430 loss: 2.4555 iter time (s): 61.967 samples/sec: 16.525 %comms: 0.0028886099230128436 %optimizer_step 0.05693644917620981 %forward: 23.421793157693372 %backward: 62.942531549580686 [2025-04-02 23:44:04,370] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13838.96 | forward: 145138.97 | backward_microstep: 390045.73 | backward: 390039.06 | backward_inner_microstep: 390023.92 | backward_inner: 390018.06 | backward_allreduce_microstep: 7.29 | backward_allreduce: 2.52 | reduce_tied_grads: 0.25 | comms: 17.90 | reduce_grads: 0.19 | step: 352.82 | _step_clipping: 0.11 | _step_step: 351.02 | _step_zero_grad: 0.64 | _step_check_overflow: 0.52 samples/sec: 16.525 | iteration 12430/ 143000 | elapsed time per iteration (ms): 61968.0 | learning rate: 5.913E-04 | approx flops per GPU: 71.3TFLOPS | lm_loss: 2.455293E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-02 23:54:35,078] [INFO] [logging.py:60:log_dist] [Rank 0] step=12440, skipped=9, lr=[0.0005912809678868945, 0.0005912809678868945], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12440 loss: 2.4572 iter time (s): 63.070 samples/sec: 16.236 %comms: 0.0028426763259875015 %optimizer_step 0.05611581952229467 %forward: 23.058724527622847 %backward: 61.862732774200445 [2025-04-02 23:54:35,078] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24441.05 | forward: 145432.02 | backward_microstep: 390177.16 | backward: 390169.98 | backward_inner_microstep: 390154.21 | backward_inner: 390148.20 | backward_allreduce_microstep: 7.61 | backward_allreduce: 2.64 | reduce_tied_grads: 0.28 | comms: 17.93 | reduce_grads: 0.19 | step: 353.92 | _step_clipping: 0.12 | _step_step: 352.29 | _step_zero_grad: 0.50 | _step_check_overflow: 0.44 samples/sec: 16.236 | iteration 12440/ 143000 | elapsed time per iteration (ms): 63070.8 | learning rate: 5.913E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.461861E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 00:04:58,730] [INFO] [logging.py:60:log_dist] [Rank 0] step=12450, skipped=9, lr=[0.0005912651867456193, 0.0005912651867456193], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12450 loss: 2.4825 iter time (s): 62.365 samples/sec: 16.420 %comms: 0.0028634808309587405 %optimizer_step 0.055294020521587904 %forward: 23.309102192454173 %backward: 62.562483731662674 [2025-04-03 00:04:58,731] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17424.20 | forward: 145366.65 | backward_microstep: 390176.18 | backward: 390169.40 | backward_inner_microstep: 390153.71 | backward_inner: 390147.81 | backward_allreduce_microstep: 7.52 | backward_allreduce: 2.59 | reduce_tied_grads: 0.26 | comms: 17.86 | reduce_grads: 0.18 | step: 344.84 | _step_clipping: 0.12 | _step_step: 343.22 | _step_zero_grad: 0.46 | _step_check_overflow: 0.51 samples/sec: 16.419 | iteration 12450/ 143000 | elapsed time per iteration (ms): 62365.3 | learning rate: 5.913E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.469619E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 00:15:23,990] [INFO] [logging.py:60:log_dist] [Rank 0] step=12460, skipped=9, lr=[0.0005912493915465802, 0.0005912493915465802], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12460 loss: 2.4529 iter time (s): 62.525 samples/sec: 16.377 %comms: 0.0028645875643362327 %optimizer_step 0.057284658835750714 %forward: 23.24061648755264 %backward: 62.40557316683159 [2025-04-03 00:15:23,991] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19048.33 | forward: 145312.96 | backward_microstep: 390201.67 | backward: 390193.54 | backward_inner_microstep: 390177.87 | backward_inner: 390171.89 | backward_allreduce_microstep: 7.50 | backward_allreduce: 2.60 | reduce_tied_grads: 0.29 | comms: 17.91 | reduce_grads: 0.18 | step: 358.17 | _step_clipping: 0.11 | _step_step: 356.45 | _step_zero_grad: 0.48 | _step_check_overflow: 0.57 samples/sec: 16.377 | iteration 12460/ 143000 | elapsed time per iteration (ms): 62526.0 | learning rate: 5.912E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.454642E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 00:25:51,627] [INFO] [logging.py:60:log_dist] [Rank 0] step=12470, skipped=9, lr=[0.0005912335822905393, 0.0005912335822905393], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12470 loss: 2.4431 iter time (s): 62.763 samples/sec: 16.315 %comms: 0.002857691937569592 %optimizer_step 0.05619952736659332 %forward: 23.13705103216332 %backward: 62.146096042724366 [2025-04-03 00:25:51,627] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21672.34 | forward: 145215.24 | backward_microstep: 390056.82 | backward: 390048.00 | backward_inner_microstep: 390032.79 | backward_inner: 390026.98 | backward_allreduce_microstep: 7.31 | backward_allreduce: 2.51 | reduce_tied_grads: 0.30 | comms: 17.94 | reduce_grads: 0.19 | step: 352.73 | _step_clipping: 0.14 | _step_step: 350.97 | _step_zero_grad: 0.52 | _step_check_overflow: 0.55 samples/sec: 16.315 | iteration 12470/ 143000 | elapsed time per iteration (ms): 62763.6 | learning rate: 5.912E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.459396E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 00:36:20,511] [INFO] [logging.py:60:log_dist] [Rank 0] step=12480, skipped=9, lr=[0.0005912177589782599, 0.0005912177589782599], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12480 loss: 2.4408 iter time (s): 62.888 samples/sec: 16.283 %comms: 0.002849971413164865 %optimizer_step 0.05600894813532292 %forward: 23.093199811102256 %backward: 62.01655078569823 [2025-04-03 00:36:20,512] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22979.03 | forward: 145228.34 | backward_microstep: 390018.53 | backward: 390009.21 | backward_inner_microstep: 389993.38 | backward_inner: 389987.47 | backward_allreduce_microstep: 7.64 | backward_allreduce: 2.63 | reduce_tied_grads: 0.29 | comms: 17.92 | reduce_grads: 0.19 | step: 352.23 | _step_clipping: 0.13 | _step_step: 350.49 | _step_zero_grad: 0.55 | _step_check_overflow: 0.49 samples/sec: 16.283 | iteration 12480/ 143000 | elapsed time per iteration (ms): 62888.5 | learning rate: 5.912E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.459196E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 00:46:50,309] [INFO] [logging.py:60:log_dist] [Rank 0] step=12490, skipped=9, lr=[0.0005912019216105057, 0.0005912019216105057], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12490 loss: 2.4715 iter time (s): 62.979 samples/sec: 16.259 %comms: 0.0028514065624466547 %optimizer_step 0.05600228965058496 %forward: 23.0747441427054 %backward: 61.94792930184235 [2025-04-03 00:46:50,309] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23594.44 | forward: 145322.86 | backward_microstep: 390152.66 | backward: 390143.00 | backward_inner_microstep: 390127.02 | backward_inner: 390120.75 | backward_allreduce_microstep: 7.61 | backward_allreduce: 2.63 | reduce_tied_grads: 0.32 | comms: 17.96 | reduce_grads: 0.19 | step: 352.70 | _step_clipping: 0.13 | _step_step: 351.01 | _step_zero_grad: 0.46 | _step_check_overflow: 0.55 samples/sec: 16.259 | iteration 12490/ 143000 | elapsed time per iteration (ms): 62979.7 | learning rate: 5.912E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.459404E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 00:57:15,657] [INFO] [logging.py:60:log_dist] [Rank 0] step=12500, skipped=9, lr=[0.0005911860701880409, 0.0005911860701880409], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12500 loss: 2.4451 iter time (s): 62.534 samples/sec: 16.375 %comms: 0.002868528758269065 %optimizer_step 0.05641490726013698 %forward: 23.25259119281507 %backward: 62.39849827292421 [2025-04-03 00:57:15,658] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19009.89 | forward: 145408.40 | backward_microstep: 390212.23 | backward: 390204.49 | backward_inner_microstep: 390188.17 | backward_inner: 390182.00 | backward_allreduce_microstep: 7.85 | backward_allreduce: 2.69 | reduce_tied_grads: 0.30 | comms: 17.94 | reduce_grads: 0.19 | step: 352.79 | _step_clipping: 0.11 | _step_step: 351.13 | _step_zero_grad: 0.47 | _step_check_overflow: 0.53 samples/sec: 16.375 | iteration 12500/ 143000 | elapsed time per iteration (ms): 62534.9 | learning rate: 5.912E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.454901E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 01:07:36,134] [INFO] [logging.py:60:log_dist] [Rank 0] step=12510, skipped=9, lr=[0.0005911702047116306, 0.0005911702047116306], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12510 loss: 2.4604 iter time (s): 62.047 samples/sec: 16.504 %comms: 0.002895049118334313 %optimizer_step 0.0570576977717596 %forward: 23.42761610155632 %backward: 62.93391036686003 [2025-04-03 01:07:36,134] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13852.94 | forward: 145361.50 | backward_microstep: 390496.83 | backward: 390486.50 | backward_inner_microstep: 390470.41 | backward_inner: 390464.27 | backward_allreduce_microstep: 7.56 | backward_allreduce: 2.63 | reduce_tied_grads: 0.32 | comms: 17.96 | reduce_grads: 0.20 | step: 354.03 | _step_clipping: 0.13 | _step_step: 352.31 | _step_zero_grad: 0.48 | _step_check_overflow: 0.53 samples/sec: 16.503 | iteration 12510/ 143000 | elapsed time per iteration (ms): 62047.6 | learning rate: 5.912E-04 | approx flops per GPU: 71.2TFLOPS | lm_loss: 2.446715E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 01:18:07,173] [INFO] [logging.py:60:log_dist] [Rank 0] step=12520, skipped=9, lr=[0.0005911543251820406, 0.0005911543251820406], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12520 loss: 2.4691 iter time (s): 63.103 samples/sec: 16.227 %comms: 0.002843375647634608 %optimizer_step 0.05606887263432045 %forward: 23.056215944791692 %backward: 61.84414076719998 [2025-04-03 01:18:07,174] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24555.75 | forward: 145492.56 | backward_microstep: 390265.34 | backward: 390257.55 | backward_inner_microstep: 390242.01 | backward_inner: 390235.97 | backward_allreduce_microstep: 7.42 | backward_allreduce: 2.54 | reduce_tied_grads: 0.30 | comms: 17.94 | reduce_grads: 0.19 | step: 353.81 | _step_clipping: 0.13 | _step_step: 351.93 | _step_zero_grad: 0.46 | _step_check_overflow: 0.74 samples/sec: 16.227 | iteration 12520/ 143000 | elapsed time per iteration (ms): 63103.9 | learning rate: 5.912E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.451995E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 01:28:36,873] [INFO] [logging.py:60:log_dist] [Rank 0] step=12530, skipped=9, lr=[0.0005911384316000372, 0.0005911384316000372], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12530 loss: 2.4516 iter time (s): 62.969 samples/sec: 16.262 %comms: 0.0028509398648778467 %optimizer_step 0.05730852187949307 %forward: 23.084882120730143 %backward: 61.95165972914365 [2025-04-03 01:28:36,873] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23514.97 | forward: 145364.17 | backward_microstep: 390113.69 | backward: 390106.02 | backward_inner_microstep: 390090.27 | backward_inner: 390082.58 | backward_allreduce_microstep: 7.53 | backward_allreduce: 2.59 | reduce_tied_grads: 0.31 | comms: 17.95 | reduce_grads: 0.19 | step: 360.87 | _step_clipping: 0.15 | _step_step: 359.12 | _step_zero_grad: 0.47 | _step_check_overflow: 0.56 samples/sec: 16.262 | iteration 12530/ 143000 | elapsed time per iteration (ms): 62970.0 | learning rate: 5.911E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.449953E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 01:38:57,479] [INFO] [logging.py:60:log_dist] [Rank 0] step=12540, skipped=9, lr=[0.0005911225239663878, 0.0005911225239663878], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12540 loss: 2.4328 iter time (s): 62.060 samples/sec: 16.500 %comms: 0.0028794592889940787 %optimizer_step 0.056547114411352864 %forward: 23.40044931036337 %backward: 62.85074534186328 [2025-04-03 01:38:57,480] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14628.37 | forward: 145223.39 | backward_microstep: 390059.80 | backward: 390052.27 | backward_inner_microstep: 390036.45 | backward_inner: 390030.33 | backward_allreduce_microstep: 7.59 | backward_allreduce: 2.59 | reduce_tied_grads: 0.28 | comms: 17.87 | reduce_grads: 0.19 | step: 350.93 | _step_clipping: 0.11 | _step_step: 349.24 | _step_zero_grad: 0.49 | _step_check_overflow: 0.52 samples/sec: 16.500 | iteration 12540/ 143000 | elapsed time per iteration (ms): 62060.6 | learning rate: 5.911E-04 | approx flops per GPU: 71.2TFLOPS | lm_loss: 2.446959E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 01:49:17,365] [INFO] [logging.py:60:log_dist] [Rank 0] step=12550, skipped=9, lr=[0.0005911066022818598, 0.0005911066022818598], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12550 loss: 2.4386 iter time (s): 61.988 samples/sec: 16.519 %comms: 0.0028822643974963456 %optimizer_step 0.05692416415173829 %forward: 23.412584469519185 %backward: 62.91408999958467 [2025-04-03 01:49:17,366] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14064.08 | forward: 145130.18 | backward_microstep: 389998.98 | backward: 389992.54 | backward_inner_microstep: 389976.95 | backward_inner: 389971.20 | backward_allreduce_microstep: 7.59 | backward_allreduce: 2.59 | reduce_tied_grads: 0.26 | comms: 17.87 | reduce_grads: 0.18 | step: 352.86 | _step_clipping: 0.11 | _step_step: 351.24 | _step_zero_grad: 0.46 | _step_check_overflow: 0.51 samples/sec: 16.519 | iteration 12550/ 143000 | elapsed time per iteration (ms): 61988.6 | learning rate: 5.911E-04 | approx flops per GPU: 71.3TFLOPS | lm_loss: 2.446433E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 01:59:43,091] [INFO] [logging.py:60:log_dist] [Rank 0] step=12560, skipped=9, lr=[0.0005910906665472218, 0.0005910906665472218], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12560 loss: 2.4543 iter time (s): 62.572 samples/sec: 16.365 %comms: 0.002861235519562311 %optimizer_step 0.05616426392897409 %forward: 23.207744400848682 %backward: 62.34409701886179 [2025-04-03 01:59:43,092] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19709.97 | forward: 145215.54 | backward_microstep: 390106.63 | backward: 390099.59 | backward_inner_microstep: 390084.05 | backward_inner: 390078.20 | backward_allreduce_microstep: 7.47 | backward_allreduce: 2.56 | reduce_tied_grads: 0.27 | comms: 17.90 | reduce_grads: 0.18 | step: 351.43 | _step_clipping: 0.09 | _step_step: 349.86 | _step_zero_grad: 0.48 | _step_check_overflow: 0.45 samples/sec: 16.365 | iteration 12560/ 143000 | elapsed time per iteration (ms): 62572.6 | learning rate: 5.911E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.455457E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 02:10:10,360] [INFO] [logging.py:60:log_dist] [Rank 0] step=12570, skipped=9, lr=[0.000591074716763243, 0.000591074716763243], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12570 loss: 2.4593 iter time (s): 62.726 samples/sec: 16.325 %comms: 0.0028570061974601403 %optimizer_step 0.05627921355923298 %forward: 23.177984340355483 %backward: 62.212736438300986 [2025-04-03 02:10:10,361] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20906.01 | forward: 145387.15 | backward_microstep: 390246.75 | backward: 390238.09 | backward_inner_microstep: 390222.39 | backward_inner: 390216.26 | backward_allreduce_microstep: 7.47 | backward_allreduce: 2.55 | reduce_tied_grads: 0.28 | comms: 17.92 | reduce_grads: 0.19 | step: 353.02 | _step_clipping: 0.12 | _step_step: 351.23 | _step_zero_grad: 0.47 | _step_check_overflow: 0.65 samples/sec: 16.325 | iteration 12570/ 143000 | elapsed time per iteration (ms): 62727.0 | learning rate: 5.911E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.457234E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 02:20:40,531] [INFO] [logging.py:60:log_dist] [Rank 0] step=12580, skipped=9, lr=[0.0005910587529306932, 0.0005910587529306932], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12580 loss: 2.4422 iter time (s): 63.016 samples/sec: 16.250 %comms: 0.002850176714815424 %optimizer_step 0.05529236133814046 %forward: 23.06403196648453 %backward: 61.91463806660986 [2025-04-03 02:20:40,531] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23906.96 | forward: 145341.22 | backward_microstep: 390172.04 | backward: 390163.74 | backward_inner_microstep: 390146.42 | backward_inner: 390140.46 | backward_allreduce_microstep: 7.48 | backward_allreduce: 2.57 | reduce_tied_grads: 0.31 | comms: 17.96 | reduce_grads: 0.19 | step: 348.43 | _step_clipping: 0.14 | _step_step: 346.80 | _step_zero_grad: 0.46 | _step_check_overflow: 0.49 samples/sec: 16.250 | iteration 12580/ 143000 | elapsed time per iteration (ms): 63017.0 | learning rate: 5.911E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.452600E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 02:31:05,320] [INFO] [logging.py:60:log_dist] [Rank 0] step=12590, skipped=9, lr=[0.0005910427750503427, 0.0005910427750503427], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12590 loss: 2.4444 iter time (s): 62.478 samples/sec: 16.390 %comms: 0.0028611341206790136 %optimizer_step 0.05615148863546724 %forward: 23.2386120316576 %backward: 62.43197270309304 [2025-04-03 02:31:05,321] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18825.27 | forward: 145191.14 | backward_microstep: 390071.89 | backward: 390065.01 | backward_inner_microstep: 390049.46 | backward_inner: 390043.55 | backward_allreduce_microstep: 7.37 | backward_allreduce: 2.53 | reduce_tied_grads: 0.28 | comms: 17.88 | reduce_grads: 0.18 | step: 350.83 | _step_clipping: 0.13 | _step_step: 349.23 | _step_zero_grad: 0.46 | _step_check_overflow: 0.47 samples/sec: 16.390 | iteration 12590/ 143000 | elapsed time per iteration (ms): 62479.0 | learning rate: 5.910E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.445908E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 02:41:30,263] [INFO] [logging.py:60:log_dist] [Rank 0] step=12600, skipped=9, lr=[0.0005910267831229628, 0.0005910267831229628], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12600 loss: 2.4437 iter time (s): 62.494 samples/sec: 16.386 %comms: 0.0028629519439875377 %optimizer_step 0.056164682623827046 %forward: 23.23506929901975 %backward: 62.410861025245204 [2025-04-03 02:41:30,264] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18965.36 | forward: 145204.54 | backward_microstep: 390035.77 | backward: 390028.55 | backward_inner_microstep: 390013.24 | backward_inner: 390007.30 | backward_allreduce_microstep: 7.34 | backward_allreduce: 2.50 | reduce_tied_grads: 0.28 | comms: 17.89 | reduce_grads: 0.18 | step: 350.99 | _step_clipping: 0.12 | _step_step: 349.31 | _step_zero_grad: 0.49 | _step_check_overflow: 0.54 samples/sec: 16.386 | iteration 12600/ 143000 | elapsed time per iteration (ms): 62494.3 | learning rate: 5.910E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.447663E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 02:51:59,548] [INFO] [logging.py:60:log_dist] [Rank 0] step=12610, skipped=9, lr=[0.0005910107771493255, 0.0005910107771493255], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12610 loss: 2.4736 iter time (s): 62.928 samples/sec: 16.273 %comms: 0.00288725790285085 %optimizer_step 0.05773000304086157 %forward: 23.12667483245711 %backward: 61.98740299447495 [2025-04-03 02:51:59,549] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22924.23 | forward: 145531.46 | backward_microstep: 390082.53 | backward: 390074.11 | backward_inner_microstep: 390058.43 | backward_inner: 390052.29 | backward_allreduce_microstep: 7.48 | backward_allreduce: 2.59 | reduce_tied_grads: 0.29 | comms: 18.17 | reduce_grads: 0.19 | step: 363.28 | _step_clipping: 0.12 | _step_step: 361.56 | _step_zero_grad: 0.50 | _step_check_overflow: 0.53 samples/sec: 16.272 | iteration 12610/ 143000 | elapsed time per iteration (ms): 62928.5 | learning rate: 5.910E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.458687E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 03:02:25,725] [INFO] [logging.py:60:log_dist] [Rank 0] step=12620, skipped=9, lr=[0.000590994757130203, 0.000590994757130203], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12620 loss: 2.4850 iter time (s): 62.617 samples/sec: 16.353 %comms: 0.0028686956393013637 %optimizer_step 0.05644193509545389 %forward: 23.21447351903084 %backward: 62.30037446324456 [2025-04-03 03:02:25,725] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19959.25 | forward: 145362.24 | backward_microstep: 390114.68 | backward: 390106.71 | backward_inner_microstep: 390090.71 | backward_inner: 390084.41 | backward_allreduce_microstep: 7.58 | backward_allreduce: 2.62 | reduce_tied_grads: 0.31 | comms: 17.96 | reduce_grads: 0.21 | step: 353.42 | _step_clipping: 0.13 | _step_step: 351.72 | _step_zero_grad: 0.46 | _step_check_overflow: 0.53 samples/sec: 16.353 | iteration 12620/ 143000 | elapsed time per iteration (ms): 62617.6 | learning rate: 5.910E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.456444E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 03:12:53,494] [INFO] [logging.py:60:log_dist] [Rank 0] step=12630, skipped=9, lr=[0.0005909787230663687, 0.0005909787230663687], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12630 loss: 2.4456 iter time (s): 62.776 samples/sec: 16.312 %comms: 0.0029490394610192086 %optimizer_step 0.05909711921650907 %forward: 23.15960342313596 %backward: 62.15161935786767 [2025-04-03 03:12:53,495] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21381.40 | forward: 145387.31 | backward_microstep: 390173.18 | backward: 390164.57 | backward_inner_microstep: 390148.58 | backward_inner: 390142.43 | backward_allreduce_microstep: 7.64 | backward_allreduce: 2.63 | reduce_tied_grads: 0.37 | comms: 18.51 | reduce_grads: 0.22 | step: 370.99 | _step_clipping: 0.15 | _step_step: 369.03 | _step_zero_grad: 0.52 | _step_check_overflow: 0.64 samples/sec: 16.312 | iteration 12630/ 143000 | elapsed time per iteration (ms): 62777.0 | learning rate: 5.910E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.450246E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 03:23:22,338] [INFO] [logging.py:60:log_dist] [Rank 0] step=12640, skipped=9, lr=[0.0005909626749585965, 0.0005909626749585965], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12640 loss: 2.4593 iter time (s): 62.884 samples/sec: 16.284 %comms: 0.002860204892007184 %optimizer_step 0.05577496220473867 %forward: 23.108617260285808 %backward: 62.03267500737785 [2025-04-03 03:23:22,339] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22699.28 | forward: 145315.80 | backward_microstep: 390092.08 | backward: 390085.12 | backward_inner_microstep: 390069.65 | backward_inner: 390063.77 | backward_allreduce_microstep: 7.43 | backward_allreduce: 2.56 | reduce_tied_grads: 0.30 | comms: 17.99 | reduce_grads: 0.19 | step: 350.73 | _step_clipping: 0.10 | _step_step: 349.01 | _step_zero_grad: 0.48 | _step_check_overflow: 0.59 samples/sec: 16.284 | iteration 12640/ 143000 | elapsed time per iteration (ms): 62884.4 | learning rate: 5.910E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.454388E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 03:33:45,658] [INFO] [logging.py:60:log_dist] [Rank 0] step=12650, skipped=9, lr=[0.0005909466128076609, 0.0005909466128076609], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12650 loss: 2.4521 iter time (s): 62.331 samples/sec: 16.428 %comms: 0.0028800416320259375 %optimizer_step 0.05595141162642455 %forward: 23.305036097140164 %backward: 62.56876530188266 [2025-04-03 03:33:45,659] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17335.22 | forward: 145263.75 | backward_microstep: 390006.86 | backward: 390000.40 | backward_inner_microstep: 389985.18 | backward_inner: 389979.51 | backward_allreduce_microstep: 7.41 | backward_allreduce: 2.55 | reduce_tied_grads: 0.29 | comms: 17.95 | reduce_grads: 0.20 | step: 348.75 | _step_clipping: 0.13 | _step_step: 347.07 | _step_zero_grad: 0.47 | _step_check_overflow: 0.52 samples/sec: 16.428 | iteration 12650/ 143000 | elapsed time per iteration (ms): 62332.0 | learning rate: 5.909E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.449962E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 03:44:10,471] [INFO] [logging.py:60:log_dist] [Rank 0] step=12660, skipped=9, lr=[0.0005909305366143371, 0.0005909305366143371], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12660 loss: 2.4602 iter time (s): 62.481 samples/sec: 16.389 %comms: 0.0028735818810836523 %optimizer_step 0.05540087741742947 %forward: 23.241773783266872 %backward: 62.44201743596347 [2025-04-03 03:44:10,471] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18691.45 | forward: 145216.30 | backward_microstep: 390148.98 | backward: 390142.29 | backward_inner_microstep: 390125.21 | backward_inner: 390119.60 | backward_allreduce_microstep: 7.35 | backward_allreduce: 2.50 | reduce_tied_grads: 0.26 | comms: 17.95 | reduce_grads: 0.26 | step: 346.15 | _step_clipping: 0.12 | _step_step: 344.33 | _step_zero_grad: 0.47 | _step_check_overflow: 0.70 samples/sec: 16.389 | iteration 12660/ 143000 | elapsed time per iteration (ms): 62481.3 | learning rate: 5.909E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.449422E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 03:54:41,029] [INFO] [logging.py:60:log_dist] [Rank 0] step=12670, skipped=9, lr=[0.000590914446379401, 0.000590914446379401], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12670 loss: 2.4737 iter time (s): 63.055 samples/sec: 16.240 %comms: 0.0028366613051806866 %optimizer_step 0.05602464684874068 %forward: 23.073462873435822 %backward: 61.9067738888543 [2025-04-03 03:54:41,029] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23930.90 | forward: 145490.30 | backward_microstep: 390362.65 | backward: 390354.71 | backward_inner_microstep: 390339.00 | backward_inner: 390333.08 | backward_allreduce_microstep: 7.55 | backward_allreduce: 2.60 | reduce_tied_grads: 0.27 | comms: 17.89 | reduce_grads: 0.19 | step: 353.26 | _step_clipping: 0.12 | _step_step: 351.58 | _step_zero_grad: 0.50 | _step_check_overflow: 0.50 samples/sec: 16.240 | iteration 12670/ 143000 | elapsed time per iteration (ms): 63055.8 | learning rate: 5.909E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.458871E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 04:05:11,335] [INFO] [logging.py:60:log_dist] [Rank 0] step=12680, skipped=9, lr=[0.0005908983421036291, 0.0005908983421036291], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12680 loss: 2.4359 iter time (s): 63.030 samples/sec: 16.246 %comms: 0.0028560638422916356 %optimizer_step 0.0556275219801669 %forward: 23.07459172877549 %backward: 61.937871669705345 [2025-04-03 04:05:11,336] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23628.95 | forward: 145439.35 | backward_microstep: 390403.31 | backward: 390394.93 | backward_inner_microstep: 390379.08 | backward_inner: 390372.98 | backward_allreduce_microstep: 7.56 | backward_allreduce: 2.60 | reduce_tied_grads: 0.32 | comms: 18.00 | reduce_grads: 0.19 | step: 350.62 | _step_clipping: 0.12 | _step_step: 348.94 | _step_zero_grad: 0.48 | _step_check_overflow: 0.50 samples/sec: 16.246 | iteration 12680/ 143000 | elapsed time per iteration (ms): 63030.6 | learning rate: 5.909E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.445580E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 04:15:35,851] [INFO] [logging.py:60:log_dist] [Rank 0] step=12690, skipped=9, lr=[0.000590882223787799, 0.000590882223787799], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12690 loss: 2.4478 iter time (s): 62.451 samples/sec: 16.397 %comms: 0.0028675024327920267 %optimizer_step 0.05595773842532935 %forward: 23.271328977752734 %backward: 62.48984608993457 [2025-04-03 04:15:35,852] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18112.25 | forward: 145331.93 | backward_microstep: 390262.88 | backward: 390255.74 | backward_inner_microstep: 390240.30 | backward_inner: 390234.45 | backward_allreduce_microstep: 7.43 | backward_allreduce: 2.56 | reduce_tied_grads: 0.29 | comms: 17.91 | reduce_grads: 0.19 | step: 349.46 | _step_clipping: 0.12 | _step_step: 347.78 | _step_zero_grad: 0.48 | _step_check_overflow: 0.53 samples/sec: 16.397 | iteration 12690/ 143000 | elapsed time per iteration (ms): 62451.6 | learning rate: 5.909E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.448543E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 04:26:00,130] [INFO] [logging.py:60:log_dist] [Rank 0] step=12700, skipped=9, lr=[0.0005908660914326882, 0.0005908660914326882], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12700 loss: 2.4429 iter time (s): 62.427 samples/sec: 16.403 %comms: 0.00286503708440826 %optimizer_step 0.055772744824611246 %forward: 23.26545836882872 %backward: 62.48777553337821 [2025-04-03 04:26:00,131] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18172.57 | forward: 145240.23 | backward_microstep: 390103.65 | backward: 390095.01 | backward_inner_microstep: 390079.79 | backward_inner: 390074.05 | backward_allreduce_microstep: 7.35 | backward_allreduce: 2.52 | reduce_tied_grads: 0.26 | comms: 17.89 | reduce_grads: 0.18 | step: 348.17 | _step_clipping: 0.12 | _step_step: 346.51 | _step_zero_grad: 0.46 | _step_check_overflow: 0.55 samples/sec: 16.403 | iteration 12700/ 143000 | elapsed time per iteration (ms): 62427.9 | learning rate: 5.909E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.447566E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 04:36:26,191] [INFO] [logging.py:60:log_dist] [Rank 0] step=12710, skipped=9, lr=[0.0005908499450390758, 0.0005908499450390758], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12710 loss: 2.4594 iter time (s): 62.605 samples/sec: 16.356 %comms: 0.0029047581112713987 %optimizer_step 0.05648407350832097 %forward: 23.219196129871282 %backward: 62.32290194867416 [2025-04-03 04:36:26,191] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19705.74 | forward: 145364.89 | backward_microstep: 390184.03 | backward: 390175.51 | backward_inner_microstep: 390159.77 | backward_inner: 390153.75 | backward_allreduce_microstep: 7.60 | backward_allreduce: 2.51 | reduce_tied_grads: 0.30 | comms: 18.19 | reduce_grads: 0.18 | step: 353.62 | _step_clipping: 0.11 | _step_step: 351.95 | _step_zero_grad: 0.47 | _step_check_overflow: 0.52 samples/sec: 16.356 | iteration 12710/ 143000 | elapsed time per iteration (ms): 62606.0 | learning rate: 5.908E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.458809E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 04:46:55,673] [INFO] [logging.py:60:log_dist] [Rank 0] step=12720, skipped=9, lr=[0.0005908337846077408, 0.0005908337846077408], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12720 loss: 2.4651 iter time (s): 62.948 samples/sec: 16.267 %comms: 0.002848666226111641 %optimizer_step 0.0557349419594568 %forward: 23.104935028976207 %backward: 62.00147959445776 [2025-04-03 04:46:55,674] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22922.09 | forward: 145440.26 | backward_microstep: 390293.79 | backward: 390285.08 | backward_inner_microstep: 390269.34 | backward_inner: 390263.26 | backward_allreduce_microstep: 7.48 | backward_allreduce: 2.56 | reduce_tied_grads: 0.30 | comms: 17.93 | reduce_grads: 0.19 | step: 350.84 | _step_clipping: 0.13 | _step_step: 349.15 | _step_zero_grad: 0.46 | _step_check_overflow: 0.56 samples/sec: 16.267 | iteration 12720/ 143000 | elapsed time per iteration (ms): 62948.3 | learning rate: 5.908E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.459638E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 04:57:27,099] [INFO] [logging.py:60:log_dist] [Rank 0] step=12730, skipped=9, lr=[0.0005908176101394631, 0.0005908176101394631], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12730 loss: 2.4639 iter time (s): 63.142 samples/sec: 16.217 %comms: 0.0028307252562030268 %optimizer_step 0.05544412362740505 %forward: 23.056627402127027 %backward: 61.82139850886268 [2025-04-03 04:57:27,099] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24633.40 | forward: 145584.15 | backward_microstep: 390361.16 | backward: 390352.64 | backward_inner_microstep: 390336.80 | backward_inner: 390330.71 | backward_allreduce_microstep: 7.58 | backward_allreduce: 2.61 | reduce_tied_grads: 0.27 | comms: 17.87 | reduce_grads: 0.18 | step: 350.09 | _step_clipping: 0.10 | _step_step: 348.42 | _step_zero_grad: 0.47 | _step_check_overflow: 0.54 samples/sec: 16.217 | iteration 12730/ 143000 | elapsed time per iteration (ms): 63142.5 | learning rate: 5.908E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.457834E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 05:07:55,522] [INFO] [logging.py:60:log_dist] [Rank 0] step=12740, skipped=9, lr=[0.0005908014216350235, 0.0005908014216350235], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12740 loss: 2.4746 iter time (s): 62.842 samples/sec: 16.295 %comms: 0.003245456464429715 %optimizer_step 0.05584132931438305 %forward: 23.12760004411496 %backward: 62.10379755120368 [2025-04-03 05:07:55,523] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21992.68 | forward: 145338.06 | backward_microstep: 390278.82 | backward: 390271.59 | backward_inner_microstep: 390256.00 | backward_inner: 390250.07 | backward_allreduce_microstep: 7.52 | backward_allreduce: 2.59 | reduce_tied_grads: 0.29 | comms: 20.40 | reduce_grads: 0.19 | step: 350.92 | _step_clipping: 0.12 | _step_step: 349.23 | _step_zero_grad: 0.46 | _step_check_overflow: 0.56 samples/sec: 16.295 | iteration 12740/ 143000 | elapsed time per iteration (ms): 62842.4 | learning rate: 5.908E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.457203E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 05:18:20,723] [INFO] [logging.py:60:log_dist] [Rank 0] step=12750, skipped=9, lr=[0.0005907852190952033, 0.0005907852190952033], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12750 loss: 2.4509 iter time (s): 62.520 samples/sec: 16.379 %comms: 0.0036467784262976684 %optimizer_step 0.057302791788501416 %forward: 23.240552776193553 %backward: 62.397384800264746 [2025-04-03 05:18:20,724] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18992.69 | forward: 145298.87 | backward_microstep: 390114.65 | backward: 390105.60 | backward_inner_microstep: 390088.24 | backward_inner: 390082.34 | backward_allreduce_microstep: 9.21 | backward_allreduce: 2.57 | reduce_tied_grads: 0.30 | comms: 22.80 | reduce_grads: 0.20 | step: 358.25 | _step_clipping: 0.12 | _step_step: 356.45 | _step_zero_grad: 0.53 | _step_check_overflow: 0.56 samples/sec: 16.379 | iteration 12750/ 143000 | elapsed time per iteration (ms): 62520.1 | learning rate: 5.908E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.453124E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 05:28:45,694] [INFO] [logging.py:60:log_dist] [Rank 0] step=12760, skipped=9, lr=[0.0005907690025207845, 0.0005907690025207845], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12760 loss: 2.4388 iter time (s): 62.497 samples/sec: 16.385 %comms: 0.002934427959500609 %optimizer_step 0.05592236871492912 %forward: 23.238841563773537 %backward: 62.4109926808744 [2025-04-03 05:28:45,695] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18898.69 | forward: 145234.70 | backward_microstep: 390054.19 | backward: 390047.06 | backward_inner_microstep: 390031.49 | backward_inner: 390025.33 | backward_allreduce_microstep: 7.41 | backward_allreduce: 2.54 | reduce_tied_grads: 0.27 | comms: 18.34 | reduce_grads: 0.23 | step: 349.50 | _step_clipping: 0.11 | _step_step: 347.91 | _step_zero_grad: 0.46 | _step_check_overflow: 0.49 samples/sec: 16.385 | iteration 12760/ 143000 | elapsed time per iteration (ms): 62497.1 | learning rate: 5.908E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.443373E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 05:39:10,580] [INFO] [logging.py:60:log_dist] [Rank 0] step=12770, skipped=9, lr=[0.0005907527719125498, 0.0005907527719125498], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12770 loss: 2.4370 iter time (s): 62.488 samples/sec: 16.387 %comms: 0.0028735125960060084 %optimizer_step 0.05604427420361533 %forward: 23.26187902735784 %backward: 62.44158090859674 [2025-04-03 05:39:10,581] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18493.48 | forward: 145358.93 | backward_microstep: 390192.75 | backward: 390185.23 | backward_inner_microstep: 390169.85 | backward_inner: 390164.01 | backward_allreduce_microstep: 7.38 | backward_allreduce: 2.56 | reduce_tied_grads: 0.27 | comms: 17.96 | reduce_grads: 0.18 | step: 350.21 | _step_clipping: 0.12 | _step_step: 348.42 | _step_zero_grad: 0.48 | _step_check_overflow: 0.64 samples/sec: 16.387 | iteration 12770/ 143000 | elapsed time per iteration (ms): 62488.6 | learning rate: 5.908E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.440993E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 05:49:34,615] [INFO] [logging.py:60:log_dist] [Rank 0] step=12780, skipped=9, lr=[0.0005907365272712825, 0.0005907365272712825], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12780 loss: 2.4485 iter time (s): 62.403 samples/sec: 16.409 %comms: 0.002915713416577288 %optimizer_step 0.055659885973992115 %forward: 23.293798848517817 %backward: 62.54759204887675 [2025-04-03 05:49:34,616] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17490.45 | forward: 145360.19 | backward_microstep: 390322.63 | backward: 390315.47 | backward_inner_microstep: 390299.61 | backward_inner: 390293.82 | backward_allreduce_microstep: 7.56 | backward_allreduce: 2.60 | reduce_tied_grads: 0.27 | comms: 18.19 | reduce_grads: 0.21 | step: 347.33 | _step_clipping: 0.11 | _step_step: 345.60 | _step_zero_grad: 0.49 | _step_check_overflow: 0.56 samples/sec: 16.409 | iteration 12780/ 143000 | elapsed time per iteration (ms): 62403.5 | learning rate: 5.907E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.441740E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 06:00:05,257] [INFO] [logging.py:60:log_dist] [Rank 0] step=12790, skipped=9, lr=[0.0005907202685977667, 0.0005907202685977667], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12790 loss: 2.4882 iter time (s): 63.064 samples/sec: 16.238 %comms: 0.0028636556849588224 %optimizer_step 0.058592128342755366 %forward: 23.06038038464799 %backward: 61.87283465149469 [2025-04-03 06:00:05,258] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24153.61 | forward: 145427.14 | backward_microstep: 390200.62 | backward: 390192.59 | backward_inner_microstep: 390176.73 | backward_inner: 390170.69 | backward_allreduce_microstep: 7.57 | backward_allreduce: 2.61 | reduce_tied_grads: 0.32 | comms: 18.06 | reduce_grads: 0.21 | step: 369.50 | _step_clipping: 0.13 | _step_step: 365.91 | _step_zero_grad: 0.51 | _step_check_overflow: 0.62 samples/sec: 16.237 | iteration 12790/ 143000 | elapsed time per iteration (ms): 63064.3 | learning rate: 5.907E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.466212E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 06:10:24,569] [INFO] [logging.py:60:log_dist] [Rank 0] step=12800, skipped=9, lr=[0.0005907039958927871, 0.0005907039958927871], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12800 loss: 2.4465 iter time (s): 61.931 samples/sec: 16.535 %comms: 0.002888640156296429 %optimizer_step 0.057174548094491545 %forward: 23.450493848325607 %backward: 62.983807409881955 [2025-04-03 06:10:24,569] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13178.37 | forward: 145230.14 | backward_microstep: 390068.74 | backward: 390062.02 | backward_inner_microstep: 390046.72 | backward_inner: 390040.92 | backward_allreduce_microstep: 7.39 | backward_allreduce: 2.54 | reduce_tied_grads: 0.27 | comms: 17.89 | reduce_grads: 0.19 | step: 354.08 | _step_clipping: 0.12 | _step_step: 352.40 | _step_zero_grad: 0.48 | _step_check_overflow: 0.54 samples/sec: 16.534 | iteration 12800/ 143000 | elapsed time per iteration (ms): 61931.1 | learning rate: 5.907E-04 | approx flops per GPU: 71.3TFLOPS | lm_loss: 2.456003E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 06:20:49,009] [INFO] [logging.py:60:log_dist] [Rank 0] step=12810, skipped=9, lr=[0.0005906877091571293, 0.0005906877091571293], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12810 loss: 2.4771 iter time (s): 62.443 samples/sec: 16.399 %comms: 0.0028715585144094965 %optimizer_step 0.05604693613249636 %forward: 23.258248857953273 %backward: 62.471543624671476 [2025-04-03 06:20:49,009] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18271.07 | forward: 145232.39 | backward_microstep: 390101.23 | backward: 390093.49 | backward_inner_microstep: 390077.94 | backward_inner: 390072.01 | backward_allreduce_microstep: 7.43 | backward_allreduce: 2.56 | reduce_tied_grads: 0.28 | comms: 17.93 | reduce_grads: 0.18 | step: 349.98 | _step_clipping: 0.10 | _step_step: 348.30 | _step_zero_grad: 0.49 | _step_check_overflow: 0.53 samples/sec: 16.399 | iteration 12810/ 143000 | elapsed time per iteration (ms): 62443.9 | learning rate: 5.907E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.456645E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 06:31:19,141] [INFO] [logging.py:60:log_dist] [Rank 0] step=12820, skipped=9, lr=[0.000590671408391579, 0.000590671408391579], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12820 loss: 2.4519 iter time (s): 63.013 samples/sec: 16.251 %comms: 0.0028454607055389894 %optimizer_step 0.055891259283618724 %forward: 23.0526719487653 %backward: 61.910353000713854 [2025-04-03 06:31:19,142] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23935.15 | forward: 145261.23 | backward_microstep: 390121.13 | backward: 390114.17 | backward_inner_microstep: 390098.31 | backward_inner: 390092.32 | backward_allreduce_microstep: 7.67 | backward_allreduce: 2.64 | reduce_tied_grads: 0.29 | comms: 17.93 | reduce_grads: 0.21 | step: 352.19 | _step_clipping: 0.13 | _step_step: 350.43 | _step_zero_grad: 0.50 | _step_check_overflow: 0.58 samples/sec: 16.251 | iteration 12820/ 143000 | elapsed time per iteration (ms): 63013.3 | learning rate: 5.907E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.452479E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 06:41:39,703] [INFO] [logging.py:60:log_dist] [Rank 0] step=12830, skipped=9, lr=[0.0005906550935969231, 0.0005906550935969231], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12830 loss: 2.4518 iter time (s): 62.056 samples/sec: 16.501 %comms: 0.0029152062580657476 %optimizer_step 0.0558489618640157 %forward: 23.391790249055962 %backward: 62.85312723721175 [2025-04-03 06:41:39,704] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14531.82 | forward: 145159.15 | backward_microstep: 390045.57 | backward: 390038.83 | backward_inner_microstep: 390023.13 | backward_inner: 390017.25 | backward_allreduce_microstep: 7.60 | backward_allreduce: 2.63 | reduce_tied_grads: 0.29 | comms: 18.09 | reduce_grads: 0.19 | step: 346.57 | _step_clipping: 0.12 | _step_step: 344.90 | _step_zero_grad: 0.48 | _step_check_overflow: 0.52 samples/sec: 16.501 | iteration 12830/ 143000 | elapsed time per iteration (ms): 62056.2 | learning rate: 5.907E-04 | approx flops per GPU: 71.2TFLOPS | lm_loss: 2.452423E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 06:52:03,542] [INFO] [logging.py:60:log_dist] [Rank 0] step=12840, skipped=9, lr=[0.000590638764773949, 0.000590638764773949], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12840 loss: 2.4376 iter time (s): 62.383 samples/sec: 16.415 %comms: 0.002879442322397309 %optimizer_step 0.05590594231446892 %forward: 23.281698488634454 %backward: 62.52429891100082 [2025-04-03 06:52:03,543] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17700.20 | forward: 145239.09 | backward_microstep: 390055.02 | backward: 390047.67 | backward_inner_microstep: 390030.45 | backward_inner: 390024.39 | backward_allreduce_microstep: 9.20 | backward_allreduce: 2.58 | reduce_tied_grads: 0.29 | comms: 17.96 | reduce_grads: 0.19 | step: 348.76 | _step_clipping: 0.12 | _step_step: 347.12 | _step_zero_grad: 0.47 | _step_check_overflow: 0.48 samples/sec: 16.414 | iteration 12840/ 143000 | elapsed time per iteration (ms): 62383.9 | learning rate: 5.906E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.450759E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 07:02:28,337] [INFO] [logging.py:60:log_dist] [Rank 0] step=12850, skipped=9, lr=[0.000590622421923445, 0.000590622421923445], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12850 loss: 2.4449 iter time (s): 62.479 samples/sec: 16.390 %comms: 0.002874352387073929 %optimizer_step 0.055822896772328066 %forward: 23.243171095346934 %backward: 62.436274356126354 [2025-04-03 07:02:28,338] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18628.77 | forward: 145220.80 | backward_microstep: 390102.08 | backward: 390095.05 | backward_inner_microstep: 390079.52 | backward_inner: 390073.67 | backward_allreduce_microstep: 7.49 | backward_allreduce: 2.60 | reduce_tied_grads: 0.30 | comms: 17.96 | reduce_grads: 0.18 | step: 348.78 | _step_clipping: 0.11 | _step_step: 347.09 | _step_zero_grad: 0.46 | _step_check_overflow: 0.57 samples/sec: 16.389 | iteration 12850/ 143000 | elapsed time per iteration (ms): 62479.5 | learning rate: 5.906E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.455976E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 07:12:52,502] [INFO] [logging.py:60:log_dist] [Rank 0] step=12860, skipped=9, lr=[0.0005906060650461997, 0.0005906060650461997], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12860 loss: 2.4523 iter time (s): 62.416 samples/sec: 16.406 %comms: 0.0028938331102777584 %optimizer_step 0.0589679518136167 %forward: 23.304325136114503 %backward: 62.558220137292 [2025-04-03 07:12:52,503] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17308.01 | forward: 145456.00 | backward_microstep: 390471.94 | backward: 390462.65 | backward_inner_microstep: 390445.79 | backward_inner: 390439.37 | backward_allreduce_microstep: 8.08 | backward_allreduce: 2.80 | reduce_tied_grads: 0.31 | comms: 18.06 | reduce_grads: 0.20 | step: 368.05 | _step_clipping: 0.13 | _step_step: 366.01 | _step_zero_grad: 0.53 | _step_check_overflow: 0.59 samples/sec: 16.406 | iteration 12860/ 143000 | elapsed time per iteration (ms): 62416.5 | learning rate: 5.906E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.452226E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 07:23:15,146] [INFO] [logging.py:60:log_dist] [Rank 0] step=12870, skipped=9, lr=[0.0005905896941430025, 0.0005905896941430025], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12870 loss: 2.4543 iter time (s): 62.264 samples/sec: 16.446 %comms: 0.002875896424918608 %optimizer_step 0.05645119895166577 %forward: 23.34067291611092 %backward: 62.69087804749819 [2025-04-03 07:23:15,146] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16073.68 | forward: 145327.93 | backward_microstep: 390344.65 | backward: 390337.30 | backward_inner_microstep: 390321.82 | backward_inner: 390316.02 | backward_allreduce_microstep: 7.31 | backward_allreduce: 2.52 | reduce_tied_grads: 0.29 | comms: 17.91 | reduce_grads: 0.18 | step: 351.49 | _step_clipping: 0.11 | _step_step: 349.84 | _step_zero_grad: 0.45 | _step_check_overflow: 0.54 samples/sec: 16.446 | iteration 12870/ 143000 | elapsed time per iteration (ms): 62264.3 | learning rate: 5.906E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.449209E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 07:33:46,802] [INFO] [logging.py:60:log_dist] [Rank 0] step=12880, skipped=9, lr=[0.0005905733092146436, 0.0005905733092146436], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12880 loss: 2.4614 iter time (s): 63.165 samples/sec: 16.211 %comms: 0.0028345979845298716 %optimizer_step 0.055785996538120865 %forward: 23.039185216735603 %backward: 61.807102649429524 [2025-04-03 07:33:46,802] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24780.68 | forward: 145527.18 | backward_microstep: 390413.51 | backward: 390405.01 | backward_inner_microstep: 390387.57 | backward_inner: 390381.57 | backward_allreduce_microstep: 9.21 | backward_allreduce: 4.29 | reduce_tied_grads: 0.30 | comms: 17.90 | reduce_grads: 0.19 | step: 352.37 | _step_clipping: 0.12 | _step_step: 350.55 | _step_zero_grad: 0.51 | _step_check_overflow: 0.63 samples/sec: 16.211 | iteration 12880/ 143000 | elapsed time per iteration (ms): 63165.6 | learning rate: 5.906E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.452692E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 07:44:17,887] [INFO] [logging.py:60:log_dist] [Rank 0] step=12890, skipped=9, lr=[0.0005905569102619139, 0.0005905569102619139], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12890 loss: 2.4811 iter time (s): 63.108 samples/sec: 16.226 %comms: 0.0028406001622168606 %optimizer_step 0.055764189794243656 %forward: 23.082222724998054 %backward: 61.86643896459489 [2025-04-03 07:44:17,888] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24042.92 | forward: 145667.25 | backward_microstep: 390437.81 | backward: 390426.61 | backward_inner_microstep: 390409.37 | backward_inner: 390403.05 | backward_allreduce_microstep: 8.54 | backward_allreduce: 2.99 | reduce_tied_grads: 0.30 | comms: 17.93 | reduce_grads: 0.20 | step: 351.92 | _step_clipping: 0.14 | _step_step: 350.20 | _step_zero_grad: 0.49 | _step_check_overflow: 0.52 samples/sec: 16.226 | iteration 12890/ 143000 | elapsed time per iteration (ms): 63108.6 | learning rate: 5.906E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.457185E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 07:54:45,941] [INFO] [logging.py:60:log_dist] [Rank 0] step=12900, skipped=9, lr=[0.0005905404972856049, 0.0005905404972856049], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12900 loss: 2.4589 iter time (s): 62.805 samples/sec: 16.305 %comms: 0.0028926572759463737 %optimizer_step 0.057358578149394684 %forward: 23.19892366955638 %backward: 62.20073974462551 [2025-04-03 07:54:45,943] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20719.14 | forward: 145700.22 | backward_microstep: 390662.35 | backward: 390650.10 | backward_inner_microstep: 390632.89 | backward_inner: 390626.39 | backward_allreduce_microstep: 8.10 | backward_allreduce: 2.79 | reduce_tied_grads: 0.32 | comms: 18.17 | reduce_grads: 0.21 | step: 360.24 | _step_clipping: 0.14 | _step_step: 358.11 | _step_zero_grad: 0.52 | _step_check_overflow: 0.68 samples/sec: 16.304 | iteration 12900/ 143000 | elapsed time per iteration (ms): 62805.5 | learning rate: 5.905E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.454357E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 08:05:16,529] [INFO] [logging.py:60:log_dist] [Rank 0] step=12910, skipped=9, lr=[0.0005905240702865085, 0.0005905240702865085], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12910 loss: 2.4396 iter time (s): 63.058 samples/sec: 16.239 %comms: 0.0028556667897828274 %optimizer_step 0.05767624183270621 %forward: 23.071577967898545 %backward: 61.90571555272819 [2025-04-03 08:05:16,530] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23832.86 | forward: 145484.88 | backward_microstep: 390374.27 | backward: 390365.38 | backward_inner_microstep: 390348.88 | backward_inner: 390342.55 | backward_allreduce_microstep: 7.90 | backward_allreduce: 2.73 | reduce_tied_grads: 0.32 | comms: 18.01 | reduce_grads: 0.20 | step: 363.70 | _step_clipping: 0.13 | _step_step: 361.87 | _step_zero_grad: 0.52 | _step_check_overflow: 0.57 samples/sec: 16.239 | iteration 12910/ 143000 | elapsed time per iteration (ms): 63058.7 | learning rate: 5.905E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.455597E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 08:15:43,549] [INFO] [logging.py:60:log_dist] [Rank 0] step=12920, skipped=9, lr=[0.0005905076292654179, 0.0005905076292654179], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12920 loss: 2.4430 iter time (s): 62.701 samples/sec: 16.331 %comms: 0.002882936933218736 %optimizer_step 0.055672760228052454 %forward: 23.236017555865338 %backward: 62.272490958442376 [2025-04-03 08:15:43,550] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19870.44 | forward: 145693.10 | backward_microstep: 390469.92 | backward: 390457.27 | backward_inner_microstep: 390440.86 | backward_inner: 390434.59 | backward_allreduce_microstep: 7.87 | backward_allreduce: 2.72 | reduce_tied_grads: 0.48 | comms: 18.08 | reduce_grads: 0.19 | step: 349.08 | _step_clipping: 0.11 | _step_step: 347.12 | _step_zero_grad: 0.71 | _step_check_overflow: 0.55 samples/sec: 16.331 | iteration 12920/ 143000 | elapsed time per iteration (ms): 62702.0 | learning rate: 5.905E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.449643E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 08:26:11,351] [INFO] [logging.py:60:log_dist] [Rank 0] step=12930, skipped=9, lr=[0.0005904911742231262, 0.0005904911742231262], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12930 loss: 2.4377 iter time (s): 62.780 samples/sec: 16.311 %comms: 0.0029081327046091396 %optimizer_step 0.058755590206854 %forward: 23.184134952730613 %backward: 62.1942830051529 [2025-04-03 08:26:11,352] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20834.79 | forward: 145549.07 | backward_microstep: 390461.66 | backward: 390453.23 | backward_inner_microstep: 390436.91 | backward_inner: 390430.64 | backward_allreduce_microstep: 7.87 | backward_allreduce: 2.68 | reduce_tied_grads: 0.35 | comms: 18.26 | reduce_grads: 0.22 | step: 368.87 | _step_clipping: 0.14 | _step_step: 366.73 | _step_zero_grad: 0.62 | _step_check_overflow: 0.70 samples/sec: 16.311 | iteration 12930/ 143000 | elapsed time per iteration (ms): 62780.2 | learning rate: 5.905E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.447169E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 08:36:40,131] [INFO] [logging.py:60:log_dist] [Rank 0] step=12940, skipped=9, lr=[0.000590474705160428, 0.000590474705160428], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12940 loss: 2.4419 iter time (s): 62.877 samples/sec: 16.286 %comms: 0.0028652732302047994 %optimizer_step 0.05605993433127434 %forward: 23.14473834197647 %backward: 62.10174782376109 [2025-04-03 08:36:40,132] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21803.83 | forward: 145528.15 | backward_microstep: 390491.27 | backward: 390479.79 | backward_inner_microstep: 390463.70 | backward_inner: 390457.38 | backward_allreduce_microstep: 7.65 | backward_allreduce: 2.65 | reduce_tied_grads: 0.30 | comms: 18.02 | reduce_grads: 0.19 | step: 352.49 | _step_clipping: 0.12 | _step_step: 350.74 | _step_zero_grad: 0.48 | _step_check_overflow: 0.59 samples/sec: 16.286 | iteration 12940/ 143000 | elapsed time per iteration (ms): 62878.0 | learning rate: 5.905E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.446268E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 08:47:08,661] [INFO] [logging.py:60:log_dist] [Rank 0] step=12950, skipped=9, lr=[0.0005904582220781179, 0.0005904582220781179], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12950 loss: 2.4578 iter time (s): 62.852 samples/sec: 16.292 %comms: 0.0028655426208607105 %optimizer_step 0.05894227939873413 %forward: 23.156500370205105 %backward: 62.12193896893265 [2025-04-03 08:47:08,661] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21542.48 | forward: 145544.11 | backward_microstep: 390461.12 | backward: 390451.15 | backward_inner_microstep: 390434.58 | backward_inner: 390428.11 | backward_allreduce_microstep: 7.86 | backward_allreduce: 2.72 | reduce_tied_grads: 0.29 | comms: 18.01 | reduce_grads: 0.20 | step: 370.47 | _step_clipping: 0.13 | _step_step: 368.71 | _step_zero_grad: 0.49 | _step_check_overflow: 0.56 samples/sec: 16.292 | iteration 12950/ 143000 | elapsed time per iteration (ms): 62852.9 | learning rate: 5.905E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.459334E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 08:57:36,858] [INFO] [logging.py:60:log_dist] [Rank 0] step=12960, skipped=9, lr=[0.0005904417249769917, 0.0005904417249769917], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12960 loss: 2.4479 iter time (s): 62.819 samples/sec: 16.301 %comms: 0.0028596917073732066 %optimizer_step 0.056291650525125546 %forward: 23.15446668517324 %backward: 62.14618568436957 [2025-04-03 08:57:36,859] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21396.47 | forward: 145454.58 | backward_microstep: 390405.45 | backward: 390397.55 | backward_inner_microstep: 390381.48 | backward_inner: 390375.27 | backward_allreduce_microstep: 7.71 | backward_allreduce: 2.65 | reduce_tied_grads: 0.29 | comms: 17.96 | reduce_grads: 0.19 | step: 353.62 | _step_clipping: 0.11 | _step_step: 351.80 | _step_zero_grad: 0.50 | _step_check_overflow: 0.64 samples/sec: 16.301 | iteration 12960/ 143000 | elapsed time per iteration (ms): 62819.8 | learning rate: 5.904E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.453341E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 09:08:05,080] [INFO] [logging.py:60:log_dist] [Rank 0] step=12970, skipped=9, lr=[0.0005904252138578454, 0.0005904252138578454], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12970 loss: 2.4272 iter time (s): 62.822 samples/sec: 16.300 %comms: 0.002850437906365134 %optimizer_step 0.055789825608674126 %forward: 23.126077396181373 %backward: 62.10899979902415 [2025-04-03 09:08:05,081] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21855.09 | forward: 145281.70 | backward_microstep: 390185.42 | backward: 390178.61 | backward_inner_microstep: 390163.03 | backward_inner: 390157.21 | backward_allreduce_microstep: 7.57 | backward_allreduce: 2.61 | reduce_tied_grads: 0.27 | comms: 17.91 | reduce_grads: 0.19 | step: 350.48 | _step_clipping: 0.13 | _step_step: 348.71 | _step_zero_grad: 0.49 | _step_check_overflow: 0.60 samples/sec: 16.300 | iteration 12970/ 143000 | elapsed time per iteration (ms): 62822.1 | learning rate: 5.904E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.445321E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 09:18:33,505] [INFO] [logging.py:60:log_dist] [Rank 0] step=12980, skipped=9, lr=[0.0005904086887214758, 0.0005904086887214758], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12980 loss: 2.4551 iter time (s): 62.842 samples/sec: 16.295 %comms: 0.0028525149291431457 %optimizer_step 0.05701903648218441 %forward: 23.12454240930057 %backward: 62.08872173625394 [2025-04-03 09:18:33,505] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22011.80 | forward: 145318.97 | backward_microstep: 390184.85 | backward: 390177.18 | backward_inner_microstep: 390161.04 | backward_inner: 390153.39 | backward_allreduce_microstep: 7.68 | backward_allreduce: 2.63 | reduce_tied_grads: 0.28 | comms: 17.93 | reduce_grads: 0.22 | step: 358.32 | _step_clipping: 0.16 | _step_step: 356.68 | _step_zero_grad: 0.47 | _step_check_overflow: 0.47 samples/sec: 16.295 | iteration 12980/ 143000 | elapsed time per iteration (ms): 62842.4 | learning rate: 5.904E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.446922E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 09:29:00,199] [INFO] [logging.py:60:log_dist] [Rank 0] step=12990, skipped=9, lr=[0.0005903921495686809, 0.0005903921495686809], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 12990 loss: 2.4388 iter time (s): 62.669 samples/sec: 16.340 %comms: 0.0029348424586474902 %optimizer_step 0.057926606630946294 %forward: 23.226437757532587 %backward: 62.27742160956775 [2025-04-03 09:29:00,199] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19897.45 | forward: 145557.45 | backward_microstep: 390294.83 | backward: 390285.53 | backward_inner_microstep: 390264.04 | backward_inner: 390257.86 | backward_allreduce_microstep: 7.83 | backward_allreduce: 2.63 | reduce_tied_grads: 0.30 | comms: 18.39 | reduce_grads: 0.20 | step: 363.02 | _step_clipping: 0.13 | _step_step: 361.23 | _step_zero_grad: 0.51 | _step_check_overflow: 0.57 samples/sec: 16.340 | iteration 12990/ 143000 | elapsed time per iteration (ms): 62669.4 | learning rate: 5.904E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.447577E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 09:39:33,904] [INFO] [logging.py:60:log_dist] [Rank 0] step=13000, skipped=9, lr=[0.0005903755964002586, 0.0005903755964002586], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13000 loss: 2.4444 iter time (s): 63.370 samples/sec: 16.159 %comms: 0.002827761659956442 %optimizer_step 0.05447670046009273 %forward: 22.95186454189423 %backward: 61.5670631170831 [2025-04-03 09:39:33,905] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27176.26 | forward: 145446.09 | backward_microstep: 390158.62 | backward: 390150.80 | backward_inner_microstep: 390135.09 | backward_inner: 390128.88 | backward_allreduce_microstep: 7.51 | backward_allreduce: 2.58 | reduce_tied_grads: 0.27 | comms: 17.92 | reduce_grads: 0.18 | step: 345.22 | _step_clipping: 0.12 | _step_step: 343.63 | _step_zero_grad: 0.46 | _step_check_overflow: 0.47 samples/sec: 16.159 | iteration 13000/ 143000 | elapsed time per iteration (ms): 63370.6 | learning rate: 5.904E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.445211E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 09:39:37,037] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: ../checkpoints/mydata_left-pythia160m/global_step13000/mp_rank_00_model_states.pt [2025-04-03 09:39:51,307] [INFO] [engine.py:1805:_copy_recovery_script] creating recovery script ../checkpoints/mydata_left-pythia160m/zero_to_fp32.py [2025-04-03 09:39:51,313] [INFO] [engine.py:1818:_save_zero_checkpoint] zero checkpoint saved ../checkpoints/mydata_left-pythia160m/global_step13000/zero_pp_rank_0_mp_rank_00_optim_states.pt [2025-04-03 09:50:18,218] [INFO] [logging.py:60:log_dist] [Rank 0] step=13010, skipped=9, lr=[0.000590359029217008, 0.000590359029217008], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13010 loss: 2.4322 iter time (s): 62.689 samples/sec: 16.335 %comms: 0.002867681127545536 %optimizer_step 0.05951756144901802 %forward: 23.231824268427484 %backward: 62.33854271533483 [2025-04-03 09:50:18,219] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19422.23 | forward: 145638.24 | backward_microstep: 390806.96 | backward: 390794.77 | backward_inner_microstep: 390778.40 | backward_inner: 390771.90 | backward_allreduce_microstep: 7.67 | backward_allreduce: 2.63 | reduce_tied_grads: 0.31 | comms: 17.98 | reduce_grads: 0.19 | step: 373.11 | _step_clipping: 0.12 | _step_step: 371.36 | _step_zero_grad: 0.49 | _step_check_overflow: 0.57 samples/sec: 15.893 | iteration 13010/ 143000 | elapsed time per iteration (ms): 64431.4 | learning rate: 5.904E-04 | approx flops per GPU: 68.6TFLOPS | lm_loss: 2.445137E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 10:00:43,941] [INFO] [logging.py:60:log_dist] [Rank 0] step=13020, skipped=9, lr=[0.0005903424480197285, 0.0005903424480197285], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13020 loss: 2.4485 iter time (s): 62.572 samples/sec: 16.365 %comms: 0.002868908068680965 %optimizer_step 0.05572810009416133 %forward: 23.236772475031543 %backward: 62.389773329799866 [2025-04-03 10:00:43,941] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18968.92 | forward: 145396.47 | backward_microstep: 390393.30 | backward: 390383.52 | backward_inner_microstep: 390367.52 | backward_inner: 390361.37 | backward_allreduce_microstep: 7.67 | backward_allreduce: 2.67 | reduce_tied_grads: 0.32 | comms: 17.95 | reduce_grads: 0.19 | step: 348.70 | _step_clipping: 0.13 | _step_step: 347.05 | _step_zero_grad: 0.46 | _step_check_overflow: 0.52 samples/sec: 16.365 | iteration 13020/ 143000 | elapsed time per iteration (ms): 62572.3 | learning rate: 5.903E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.442748E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 10:11:08,886] [INFO] [logging.py:60:log_dist] [Rank 0] step=13030, skipped=9, lr=[0.0005903258528092206, 0.0005903258528092206], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13030 loss: 2.4658 iter time (s): 62.494 samples/sec: 16.386 %comms: 0.002906431443769951 %optimizer_step 0.057403842708291644 %forward: 23.258066492443785 %backward: 62.46495179805171 [2025-04-03 10:11:08,887] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18239.07 | forward: 145348.88 | backward_microstep: 390377.05 | backward: 390368.25 | backward_inner_microstep: 390351.99 | backward_inner: 390345.54 | backward_allreduce_microstep: 7.74 | backward_allreduce: 2.66 | reduce_tied_grads: 0.34 | comms: 18.16 | reduce_grads: 0.21 | step: 358.74 | _step_clipping: 0.13 | _step_step: 356.90 | _step_zero_grad: 0.49 | _step_check_overflow: 0.62 samples/sec: 16.385 | iteration 13030/ 143000 | elapsed time per iteration (ms): 62494.5 | learning rate: 5.903E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.447982E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 10:21:39,445] [INFO] [logging.py:60:log_dist] [Rank 0] step=13040, skipped=9, lr=[0.0005903092435862852, 0.0005903092435862852], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13040 loss: 2.4575 iter time (s): 63.055 samples/sec: 16.240 %comms: 0.002895528573600705 %optimizer_step 0.056174063483254974 %forward: 23.05243756911402 %backward: 61.884270994900916 [2025-04-03 10:21:39,446] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24015.53 | forward: 145357.95 | backward_microstep: 390222.52 | backward: 390213.43 | backward_inner_microstep: 390197.04 | backward_inner: 390190.75 | backward_allreduce_microstep: 7.83 | backward_allreduce: 2.69 | reduce_tied_grads: 0.51 | comms: 18.26 | reduce_grads: 0.21 | step: 354.21 | _step_clipping: 0.12 | _step_step: 352.52 | _step_zero_grad: 0.49 | _step_check_overflow: 0.49 samples/sec: 16.240 | iteration 13040/ 143000 | elapsed time per iteration (ms): 63055.9 | learning rate: 5.903E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.446641E+00 | loss scale: 1048576.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 10:32:08,661] [INFO] [logging.py:60:log_dist] [Rank 0] step=13050, skipped=9, lr=[0.0005902926203517238, 0.0005902926203517238], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13050 loss: 2.4420 iter time (s): 62.921 samples/sec: 16.274 %comms: 0.0028566943177506625 %optimizer_step 0.05605147161212796 %forward: 23.102159169259213 %backward: 62.02187566106478 [2025-04-03 10:32:08,662] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22580.21 | forward: 145361.17 | backward_microstep: 390256.32 | backward: 390248.03 | backward_inner_microstep: 390228.31 | backward_inner: 390220.38 | backward_allreduce_microstep: 11.31 | backward_allreduce: 2.71 | reduce_tied_grads: 0.29 | comms: 17.97 | reduce_grads: 1.89 | step: 352.68 | _step_clipping: 0.14 | _step_step: 350.86 | _step_zero_grad: 0.52 | _step_check_overflow: 0.58 samples/sec: 16.274 | iteration 13050/ 143000 | elapsed time per iteration (ms): 62921.6 | learning rate: 5.903E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.451927E+00 | loss scale: 1048576.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 10:38:25,285] [INFO] [stage1.py:697:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 1048576.0, reducing to 1048576.0 [2025-04-03 10:39:27,657] [INFO] [stage1.py:697:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 1048576.0, reducing to 524288.0 [2025-04-03 10:42:35,065] [INFO] [logging.py:60:log_dist] [Rank 0] step=13060, skipped=11, lr=[0.0005902793116762434, 0.0005902793116762434], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13060 loss: 2.4375 iter time (s): 62.640 samples/sec: 16.347 %comms: 0.0023040347252494277 %optimizer_step 0.04815396036415962 %forward: 23.224411781007987 %backward: 62.339815272365925 [2025-04-03 10:42:35,065] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19436.09 | forward: 145477.28 | backward_microstep: 390505.58 | backward: 390495.43 | backward_inner_microstep: 390479.22 | backward_inner: 390472.97 | backward_allreduce_microstep: 7.60 | backward_allreduce: 2.65 | reduce_tied_grads: 0.29 | comms: 14.43 | reduce_grads: 0.18 | step: 301.64 | _step_clipping: 0.13 | _step_step: 299.98 | _step_zero_grad: 0.47 | _step_check_overflow: 0.50 samples/sec: 16.347 | iteration 13060/ 143000 | elapsed time per iteration (ms): 62640.4 | learning rate: 5.903E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.446841E+00 | loss scale: 524288.0 | number of skipped iterations: 2 | number of nan iterations: 0 | time (ms) [2025-04-03 10:53:03,425] [INFO] [logging.py:60:log_dist] [Rank 0] step=13070, skipped=11, lr=[0.0005902626632227777, 0.0005902626632227777], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13070 loss: 2.4377 iter time (s): 62.835 samples/sec: 16.297 %comms: 0.002848973057562875 %optimizer_step 0.057661871492074815 %forward: 23.14544440695515 %backward: 62.121543880799166 [2025-04-03 10:53:03,426] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21567.26 | forward: 145435.51 | backward_microstep: 390352.16 | backward: 390343.70 | backward_inner_microstep: 390327.85 | backward_inner: 390321.73 | backward_allreduce_microstep: 7.51 | backward_allreduce: 2.58 | reduce_tied_grads: 0.27 | comms: 17.90 | reduce_grads: 0.18 | step: 362.32 | _step_clipping: 0.12 | _step_step: 360.69 | _step_zero_grad: 0.46 | _step_check_overflow: 0.51 samples/sec: 16.296 | iteration 13070/ 143000 | elapsed time per iteration (ms): 62836.0 | learning rate: 5.903E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.441722E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 11:03:40,673] [INFO] [logging.py:60:log_dist] [Rank 0] step=13080, skipped=11, lr=[0.0005902460007599344, 0.0005902460007599344], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13080 loss: 2.4380 iter time (s): 63.724 samples/sec: 16.069 %comms: 0.0028424989212944072 %optimizer_step 0.05654090053623098 %forward: 22.861642497156694 %backward: 61.29379672373268 [2025-04-03 11:03:40,674] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29884.28 | forward: 145684.12 | backward_microstep: 390602.66 | backward: 390590.16 | backward_inner_microstep: 390573.38 | backward_inner: 390566.85 | backward_allreduce_microstep: 7.81 | backward_allreduce: 2.69 | reduce_tied_grads: 0.31 | comms: 18.11 | reduce_grads: 0.20 | step: 360.30 | _step_clipping: 0.11 | _step_step: 358.60 | _step_zero_grad: 0.48 | _step_check_overflow: 0.54 samples/sec: 16.069 | iteration 13080/ 143000 | elapsed time per iteration (ms): 63724.8 | learning rate: 5.902E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.441152E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 11:14:08,435] [INFO] [logging.py:60:log_dist] [Rank 0] step=13090, skipped=11, lr=[0.0005902293242885176, 0.0005902293242885176], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13090 loss: 2.4739 iter time (s): 62.776 samples/sec: 16.312 %comms: 0.0028495987770049205 %optimizer_step 0.05752619460443736 %forward: 23.149794205664133 %backward: 62.16454190781887 [2025-04-03 11:14:08,436] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21170.14 | forward: 145324.37 | backward_microstep: 390250.19 | backward: 390242.05 | backward_inner_microstep: 390226.21 | backward_inner: 390220.10 | backward_allreduce_microstep: 7.58 | backward_allreduce: 2.61 | reduce_tied_grads: 0.29 | comms: 17.89 | reduce_grads: 0.19 | step: 361.12 | _step_clipping: 0.11 | _step_step: 359.49 | _step_zero_grad: 0.46 | _step_check_overflow: 0.52 samples/sec: 16.312 | iteration 13090/ 143000 | elapsed time per iteration (ms): 62776.2 | learning rate: 5.902E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.446914E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 11:24:38,275] [INFO] [logging.py:60:log_dist] [Rank 0] step=13100, skipped=11, lr=[0.0005902126338093322, 0.0005902126338093322], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13100 loss: 2.4441 iter time (s): 62.983 samples/sec: 16.258 %comms: 0.0028481485836689135 %optimizer_step 0.05528754561924235 %forward: 23.107989819879627 %backward: 61.97197193835792 [2025-04-03 11:24:38,275] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22975.58 | forward: 145542.02 | backward_microstep: 390330.36 | backward: 390320.67 | backward_inner_microstep: 390304.53 | backward_inner: 390298.25 | backward_allreduce_microstep: 7.67 | backward_allreduce: 2.63 | reduce_tied_grads: 0.32 | comms: 17.94 | reduce_grads: 0.19 | step: 348.22 | _step_clipping: 0.13 | _step_step: 346.50 | _step_zero_grad: 0.47 | _step_check_overflow: 0.57 samples/sec: 16.258 | iteration 13100/ 143000 | elapsed time per iteration (ms): 62983.9 | learning rate: 5.902E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.445277E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 11:35:04,906] [INFO] [logging.py:60:log_dist] [Rank 0] step=13110, skipped=11, lr=[0.0005901959293231837, 0.0005901959293231837], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13110 loss: 2.4395 iter time (s): 62.663 samples/sec: 16.341 %comms: 0.0028559220822211378 %optimizer_step 0.055694913197103624 %forward: 23.217660247803245 %backward: 62.29359294565657 [2025-04-03 11:35:04,906] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19754.49 | forward: 145487.79 | backward_microstep: 390359.90 | backward: 390347.56 | backward_inner_microstep: 390329.58 | backward_inner: 390323.34 | backward_allreduce_microstep: 7.67 | backward_allreduce: 2.60 | reduce_tied_grads: 0.29 | comms: 17.90 | reduce_grads: 0.19 | step: 349.00 | _step_clipping: 0.13 | _step_step: 347.36 | _step_zero_grad: 0.46 | _step_check_overflow: 0.52 samples/sec: 16.341 | iteration 13110/ 143000 | elapsed time per iteration (ms): 62663.1 | learning rate: 5.902E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.437729E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 11:45:29,738] [INFO] [logging.py:60:log_dist] [Rank 0] step=13120, skipped=11, lr=[0.0005901792108308784, 0.0005901792108308784], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13120 loss: 2.4332 iter time (s): 62.483 samples/sec: 16.389 %comms: 0.0028682245513586713 %optimizer_step 0.055472030607191726 %forward: 23.27237388995726 %backward: 62.46637414235858 [2025-04-03 11:45:29,739] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18078.26 | forward: 145412.13 | backward_microstep: 390315.46 | backward: 390306.91 | backward_inner_microstep: 390290.38 | backward_inner: 390284.25 | backward_allreduce_microstep: 8.15 | backward_allreduce: 2.69 | reduce_tied_grads: 0.29 | comms: 17.92 | reduce_grads: 0.19 | step: 346.60 | _step_clipping: 0.13 | _step_step: 344.93 | _step_zero_grad: 0.48 | _step_check_overflow: 0.52 samples/sec: 16.388 | iteration 13120/ 143000 | elapsed time per iteration (ms): 62483.3 | learning rate: 5.902E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.439459E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 11:55:55,340] [INFO] [logging.py:60:log_dist] [Rank 0] step=13130, skipped=11, lr=[0.0005901624783332232, 0.0005901624783332232], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13130 loss: 2.4408 iter time (s): 62.560 samples/sec: 16.368 %comms: 0.0028719016557852443 %optimizer_step 0.05576894059720268 %forward: 23.24380550769471 %backward: 62.40258070606451 [2025-04-03 11:55:55,340] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18744.33 | forward: 145412.37 | backward_microstep: 390396.71 | backward: 390388.20 | backward_inner_microstep: 390371.98 | backward_inner: 390365.72 | backward_allreduce_microstep: 7.73 | backward_allreduce: 2.66 | reduce_tied_grads: 0.30 | comms: 17.97 | reduce_grads: 0.20 | step: 348.89 | _step_clipping: 0.12 | _step_step: 347.17 | _step_zero_grad: 0.48 | _step_check_overflow: 0.54 samples/sec: 16.368 | iteration 13130/ 143000 | elapsed time per iteration (ms): 62560.1 | learning rate: 5.902E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.446141E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 12:06:18,200] [INFO] [logging.py:60:log_dist] [Rank 0] step=13140, skipped=11, lr=[0.0005901457318310255, 0.0005901457318310255], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13140 loss: 2.4324 iter time (s): 62.286 samples/sec: 16.440 %comms: 0.0028730569511188765 %optimizer_step 0.056911270747562095 %forward: 23.346266139461154 %backward: 62.6633329498648 [2025-04-03 12:06:18,201] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16096.50 | forward: 145413.42 | backward_microstep: 390308.83 | backward: 390301.80 | backward_inner_microstep: 390285.68 | backward_inner: 390279.71 | backward_allreduce_microstep: 7.83 | backward_allreduce: 2.69 | reduce_tied_grads: 0.27 | comms: 17.89 | reduce_grads: 0.19 | step: 354.47 | _step_clipping: 0.12 | _step_step: 352.80 | _step_zero_grad: 0.48 | _step_check_overflow: 0.52 samples/sec: 16.440 | iteration 13140/ 143000 | elapsed time per iteration (ms): 62286.0 | learning rate: 5.901E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.438775E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 12:16:43,794] [INFO] [logging.py:60:log_dist] [Rank 0] step=13150, skipped=11, lr=[0.0005901289713250939, 0.0005901289713250939], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13150 loss: 2.4737 iter time (s): 62.559 samples/sec: 16.369 %comms: 0.0028716318561207168 %optimizer_step 0.05651861906238244 %forward: 23.25240911215432 %backward: 62.39795678932535 [2025-04-03 12:16:43,795] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18738.20 | forward: 145464.42 | backward_microstep: 390361.97 | backward: 390354.50 | backward_inner_microstep: 390338.67 | backward_inner: 390332.67 | backward_allreduce_microstep: 7.65 | backward_allreduce: 2.62 | reduce_tied_grads: 0.29 | comms: 17.96 | reduce_grads: 0.19 | step: 353.57 | _step_clipping: 0.11 | _step_step: 351.86 | _step_zero_grad: 0.48 | _step_check_overflow: 0.55 samples/sec: 16.368 | iteration 13150/ 143000 | elapsed time per iteration (ms): 62559.4 | learning rate: 5.901E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.445724E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 12:27:09,728] [INFO] [logging.py:60:log_dist] [Rank 0] step=13160, skipped=11, lr=[0.0005901121968162372, 0.0005901121968162372], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13160 loss: 2.4441 iter time (s): 62.593 samples/sec: 16.360 %comms: 0.002879176711417844 %optimizer_step 0.05643701336507598 %forward: 23.245386582308694 %backward: 62.37228783166913 [2025-04-03 12:27:09,729] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18959.52 | forward: 145499.47 | backward_microstep: 390415.75 | backward: 390405.84 | backward_inner_microstep: 390389.40 | backward_inner: 390382.97 | backward_allreduce_microstep: 7.73 | backward_allreduce: 2.65 | reduce_tied_grads: 0.30 | comms: 18.02 | reduce_grads: 0.20 | step: 353.26 | _step_clipping: 0.12 | _step_step: 351.25 | _step_zero_grad: 0.51 | _step_check_overflow: 0.60 samples/sec: 16.360 | iteration 13160/ 143000 | elapsed time per iteration (ms): 62593.4 | learning rate: 5.901E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.454558E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 12:37:38,960] [INFO] [logging.py:60:log_dist] [Rank 0] step=13170, skipped=11, lr=[0.0005900954083052648, 0.0005900954083052648], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13170 loss: 2.4412 iter time (s): 62.923 samples/sec: 16.274 %comms: 0.002847753475100319 %optimizer_step 0.05537525347168541 %forward: 23.133618143161407 %backward: 62.05201301589717 [2025-04-03 12:37:38,961] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22122.55 | forward: 145562.90 | backward_microstep: 390457.10 | backward: 390447.83 | backward_inner_microstep: 390432.22 | backward_inner: 390426.19 | backward_allreduce_microstep: 7.42 | backward_allreduce: 2.56 | reduce_tied_grads: 0.29 | comms: 17.92 | reduce_grads: 0.21 | step: 348.44 | _step_clipping: 0.13 | _step_step: 346.79 | _step_zero_grad: 0.46 | _step_check_overflow: 0.51 samples/sec: 16.274 | iteration 13170/ 143000 | elapsed time per iteration (ms): 62923.2 | learning rate: 5.901E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.444404E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 12:48:04,958] [INFO] [logging.py:60:log_dist] [Rank 0] step=13180, skipped=11, lr=[0.0005900786057929873, 0.0005900786057929873], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13180 loss: 2.4525 iter time (s): 62.599 samples/sec: 16.358 %comms: 0.002863913122409517 %optimizer_step 0.05585432309175009 %forward: 23.22014686430617 %backward: 62.354220285476636 [2025-04-03 12:48:04,959] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19258.78 | forward: 145356.41 | backward_microstep: 390340.35 | backward: 390332.82 | backward_inner_microstep: 390317.16 | backward_inner: 390311.33 | backward_allreduce_microstep: 7.64 | backward_allreduce: 2.54 | reduce_tied_grads: 0.30 | comms: 17.93 | reduce_grads: 0.20 | step: 349.64 | _step_clipping: 0.12 | _step_step: 347.94 | _step_zero_grad: 0.46 | _step_check_overflow: 0.56 samples/sec: 16.358 | iteration 13180/ 143000 | elapsed time per iteration (ms): 62599.8 | learning rate: 5.901E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.433957E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 12:58:29,739] [INFO] [logging.py:60:log_dist] [Rank 0] step=13190, skipped=11, lr=[0.0005900617892802155, 0.0005900617892802155], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13190 loss: 2.4532 iter time (s): 62.478 samples/sec: 16.390 %comms: 0.002875064722108349 %optimizer_step 0.056159946106173184 %forward: 23.281443514003787 %backward: 62.48126681595508 [2025-04-03 12:58:29,740] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17858.30 | forward: 145456.71 | backward_microstep: 390375.39 | backward: 390367.52 | backward_inner_microstep: 390351.81 | backward_inner: 390345.62 | backward_allreduce_microstep: 7.55 | backward_allreduce: 2.61 | reduce_tied_grads: 0.29 | comms: 17.96 | reduce_grads: 0.20 | step: 350.87 | _step_clipping: 0.12 | _step_step: 349.12 | _step_zero_grad: 0.50 | _step_check_overflow: 0.55 samples/sec: 16.390 | iteration 13190/ 143000 | elapsed time per iteration (ms): 62478.1 | learning rate: 5.901E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.443135E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 13:08:56,504] [INFO] [logging.py:60:log_dist] [Rank 0] step=13200, skipped=11, lr=[0.000590044958767761, 0.000590044958767761], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13200 loss: 2.4552 iter time (s): 62.676 samples/sec: 16.338 %comms: 0.002905753320401755 %optimizer_step 0.056942410620950595 %forward: 23.224424620834103 %backward: 62.29247361476644 [2025-04-03 13:08:56,505] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19697.83 | forward: 145561.25 | backward_microstep: 390433.71 | backward: 390423.89 | backward_inner_microstep: 390406.94 | backward_inner: 390399.93 | backward_allreduce_microstep: 8.07 | backward_allreduce: 2.76 | reduce_tied_grads: 0.33 | comms: 18.21 | reduce_grads: 0.21 | step: 356.89 | _step_clipping: 0.12 | _step_step: 355.05 | _step_zero_grad: 0.54 | _step_check_overflow: 0.57 samples/sec: 16.338 | iteration 13200/ 143000 | elapsed time per iteration (ms): 62676.5 | learning rate: 5.900E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.447668E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 13:19:20,159] [INFO] [logging.py:60:log_dist] [Rank 0] step=13210, skipped=11, lr=[0.0005900281142564364, 0.0005900281142564364], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13210 loss: 2.4311 iter time (s): 62.364 samples/sec: 16.420 %comms: 0.003340082465262844 %optimizer_step 0.061139932741119375 %forward: 23.33909345403017 %backward: 62.65198747675518 [2025-04-03 13:19:20,160] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16206.76 | forward: 145552.37 | backward_microstep: 390736.83 | backward: 390724.06 | backward_inner_microstep: 390706.96 | backward_inner: 390700.56 | backward_allreduce_microstep: 8.17 | backward_allreduce: 2.88 | reduce_tied_grads: 0.31 | comms: 20.83 | reduce_grads: 0.20 | step: 381.29 | _step_clipping: 0.16 | _step_step: 379.11 | _step_zero_grad: 0.57 | _step_check_overflow: 0.81 samples/sec: 16.419 | iteration 13210/ 143000 | elapsed time per iteration (ms): 62365.5 | learning rate: 5.900E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.442770E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 13:29:46,788] [INFO] [logging.py:60:log_dist] [Rank 0] step=13220, skipped=11, lr=[0.0005900112557470543, 0.0005900112557470543], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13220 loss: 2.4562 iter time (s): 62.662 samples/sec: 16.342 %comms: 0.0028756073320314927 %optimizer_step 0.056360998160508484 %forward: 23.23056495322787 %backward: 62.32473945119131 [2025-04-03 13:29:46,788] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19406.08 | forward: 145567.92 | backward_microstep: 390551.88 | backward: 390540.77 | backward_inner_microstep: 390524.63 | backward_inner: 390518.30 | backward_allreduce_microstep: 7.62 | backward_allreduce: 2.62 | reduce_tied_grads: 0.28 | comms: 18.02 | reduce_grads: 0.19 | step: 353.17 | _step_clipping: 0.10 | _step_step: 351.36 | _step_zero_grad: 0.54 | _step_check_overflow: 0.58 samples/sec: 16.341 | iteration 13220/ 143000 | elapsed time per iteration (ms): 62662.8 | learning rate: 5.900E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.444651E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 13:40:13,004] [INFO] [logging.py:60:log_dist] [Rank 0] step=13230, skipped=11, lr=[0.0005899943832404286, 0.0005899943832404286], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13230 loss: 2.4320 iter time (s): 62.621 samples/sec: 16.352 %comms: 0.0029621748422166264 %optimizer_step 0.056704239641088464 %forward: 23.26191715535435 %backward: 62.406029565257626 [2025-04-03 13:40:13,004] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18598.93 | forward: 145668.51 | backward_microstep: 390808.43 | backward: 390792.95 | backward_inner_microstep: 390776.26 | backward_inner: 390769.65 | backward_allreduce_microstep: 7.67 | backward_allreduce: 2.66 | reduce_tied_grads: 0.34 | comms: 18.55 | reduce_grads: 0.21 | step: 355.09 | _step_clipping: 0.13 | _step_step: 353.35 | _step_zero_grad: 0.47 | _step_check_overflow: 0.55 samples/sec: 16.352 | iteration 13230/ 143000 | elapsed time per iteration (ms): 62621.6 | learning rate: 5.900E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.445666E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 13:50:49,766] [INFO] [logging.py:60:log_dist] [Rank 0] step=13240, skipped=11, lr=[0.0005899774967373736, 0.0005899774967373736], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13240 loss: 2.4386 iter time (s): 63.676 samples/sec: 16.082 %comms: 0.0028890759691635893 %optimizer_step 0.05565638609164636 %forward: 22.86622375267415 %backward: 61.32254672182973 [2025-04-03 13:50:49,767] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29577.19 | forward: 145602.15 | backward_microstep: 390487.97 | backward: 390475.27 | backward_inner_microstep: 390458.80 | backward_inner: 390452.51 | backward_allreduce_microstep: 7.82 | backward_allreduce: 2.69 | reduce_tied_grads: 0.29 | comms: 18.40 | reduce_grads: 0.19 | step: 354.40 | _step_clipping: 0.11 | _step_step: 352.63 | _step_zero_grad: 0.52 | _step_check_overflow: 0.55 samples/sec: 16.081 | iteration 13240/ 143000 | elapsed time per iteration (ms): 63676.3 | learning rate: 5.900E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.441441E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 14:01:17,044] [INFO] [logging.py:60:log_dist] [Rank 0] step=13250, skipped=11, lr=[0.0005899605962387043, 0.0005899605962387043], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13250 loss: 2.4446 iter time (s): 62.727 samples/sec: 16.325 %comms: 0.002858147461124091 %optimizer_step 0.057368424775693205 %forward: 23.223944261885006 %backward: 62.24335511892004 [2025-04-03 14:01:17,045] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20061.57 | forward: 145677.33 | backward_microstep: 390444.70 | backward: 390435.21 | backward_inner_microstep: 390418.97 | backward_inner: 390412.78 | backward_allreduce_microstep: 7.81 | backward_allreduce: 2.81 | reduce_tied_grads: 0.29 | comms: 17.93 | reduce_grads: 0.20 | step: 359.86 | _step_clipping: 0.12 | _step_step: 358.19 | _step_zero_grad: 0.49 | _step_check_overflow: 0.51 samples/sec: 16.325 | iteration 13250/ 143000 | elapsed time per iteration (ms): 62727.8 | learning rate: 5.900E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.447447E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 14:11:46,559] [INFO] [logging.py:60:log_dist] [Rank 0] step=13260, skipped=11, lr=[0.0005899436817452363, 0.0005899436817452363], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13260 loss: 2.4550 iter time (s): 62.951 samples/sec: 16.267 %comms: 0.0028963164873171865 %optimizer_step 0.055925878076556926 %forward: 23.11817936744957 %backward: 62.02515788229337 [2025-04-03 14:11:46,560] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22422.42 | forward: 145531.11 | backward_microstep: 390464.44 | backward: 390454.19 | backward_inner_microstep: 390437.99 | backward_inner: 390431.55 | backward_allreduce_microstep: 7.56 | backward_allreduce: 2.59 | reduce_tied_grads: 0.32 | comms: 18.23 | reduce_grads: 0.19 | step: 352.06 | _step_clipping: 0.12 | _step_step: 350.26 | _step_zero_grad: 0.52 | _step_check_overflow: 0.56 samples/sec: 16.266 | iteration 13260/ 143000 | elapsed time per iteration (ms): 62951.5 | learning rate: 5.899E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.447398E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 14:22:09,026] [INFO] [logging.py:60:log_dist] [Rank 0] step=13270, skipped=11, lr=[0.0005899267532577862, 0.0005899267532577862], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13270 loss: 2.4346 iter time (s): 62.246 samples/sec: 16.451 %comms: 0.002891611774751007 %optimizer_step 0.05576416658154183 %forward: 23.347418437844823 %backward: 62.68981004452458 [2025-04-03 14:22:09,027] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15842.82 | forward: 145328.71 | backward_microstep: 390228.91 | backward: 390219.98 | backward_inner_microstep: 390204.02 | backward_inner: 390197.91 | backward_allreduce_microstep: 7.62 | backward_allreduce: 2.67 | reduce_tied_grads: 0.28 | comms: 18.00 | reduce_grads: 0.19 | step: 347.11 | _step_clipping: 0.12 | _step_step: 345.42 | _step_zero_grad: 0.51 | _step_check_overflow: 0.51 samples/sec: 16.451 | iteration 13270/ 143000 | elapsed time per iteration (ms): 62246.7 | learning rate: 5.899E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.444494E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 14:32:35,893] [INFO] [logging.py:60:log_dist] [Rank 0] step=13280, skipped=11, lr=[0.0005899098107771709, 0.0005899098107771709], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13280 loss: 2.4383 iter time (s): 62.686 samples/sec: 16.335 %comms: 0.0028930306806468483 %optimizer_step 0.05630207011419843 %forward: 23.181857392278708 %backward: 62.23545457017746 [2025-04-03 14:32:35,895] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20334.32 | forward: 145318.25 | backward_microstep: 390138.61 | backward: 390130.40 | backward_inner_microstep: 390114.72 | backward_inner: 390108.45 | backward_allreduce_microstep: 7.49 | backward_allreduce: 2.59 | reduce_tied_grads: 0.32 | comms: 18.14 | reduce_grads: 0.21 | step: 352.94 | _step_clipping: 0.14 | _step_step: 350.99 | _step_zero_grad: 0.51 | _step_check_overflow: 0.70 samples/sec: 16.335 | iteration 13280/ 143000 | elapsed time per iteration (ms): 62686.8 | learning rate: 5.899E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.443701E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 14:43:06,361] [INFO] [logging.py:60:log_dist] [Rank 0] step=13290, skipped=11, lr=[0.0005898928543042082, 0.0005898928543042082], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13290 loss: 2.4409 iter time (s): 63.046 samples/sec: 16.242 %comms: 0.0036629488026876543 %optimizer_step 0.05921118677187505 %forward: 23.09675005239029 %backward: 61.929855463362806 [2025-04-03 14:43:06,362] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23246.27 | forward: 145615.99 | backward_microstep: 390456.27 | backward: 390443.56 | backward_inner_microstep: 390426.93 | backward_inner: 390418.52 | backward_allreduce_microstep: 7.85 | backward_allreduce: 2.72 | reduce_tied_grads: 0.31 | comms: 23.09 | reduce_grads: 0.20 | step: 373.30 | _step_clipping: 0.12 | _step_step: 371.46 | _step_zero_grad: 0.50 | _step_check_overflow: 0.62 samples/sec: 16.242 | iteration 13290/ 143000 | elapsed time per iteration (ms): 63046.7 | learning rate: 5.899E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.434925E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 14:53:32,861] [INFO] [logging.py:60:log_dist] [Rank 0] step=13300, skipped=11, lr=[0.0005898758838397162, 0.0005898758838397162], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13300 loss: 2.4281 iter time (s): 62.649 samples/sec: 16.345 %comms: 0.0028696142118700848 %optimizer_step 0.05706146325280138 %forward: 23.217196209497835 %backward: 62.31553800270668 [2025-04-03 14:53:32,861] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19505.41 | forward: 145454.28 | backward_microstep: 390415.11 | backward: 390402.94 | backward_inner_microstep: 390386.61 | backward_inner: 390380.36 | backward_allreduce_microstep: 7.67 | backward_allreduce: 2.64 | reduce_tied_grads: 0.30 | comms: 17.98 | reduce_grads: 0.19 | step: 357.49 | _step_clipping: 0.13 | _step_step: 355.74 | _step_zero_grad: 0.49 | _step_check_overflow: 0.53 samples/sec: 16.345 | iteration 13300/ 143000 | elapsed time per iteration (ms): 62649.9 | learning rate: 5.899E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.433068E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 15:04:01,611] [INFO] [logging.py:60:log_dist] [Rank 0] step=13310, skipped=11, lr=[0.0005898588993845144, 0.0005898588993845144], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13310 loss: 2.4450 iter time (s): 62.875 samples/sec: 16.286 %comms: 0.0028396582024696622 %optimizer_step 0.05876313284565025 %forward: 23.110710111467867 %backward: 62.04709088855702 [2025-04-03 15:04:01,612] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22256.14 | forward: 145307.47 | backward_microstep: 390126.27 | backward: 390118.09 | backward_inner_microstep: 390100.51 | backward_inner: 390094.48 | backward_allreduce_microstep: 9.43 | backward_allreduce: 2.58 | reduce_tied_grads: 0.25 | comms: 17.85 | reduce_grads: 0.18 | step: 369.47 | _step_clipping: 0.13 | _step_step: 367.84 | _step_zero_grad: 0.50 | _step_check_overflow: 0.47 samples/sec: 16.286 | iteration 13310/ 143000 | elapsed time per iteration (ms): 62875.1 | learning rate: 5.899E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.431262E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 15:14:35,196] [INFO] [logging.py:60:log_dist] [Rank 0] step=13320, skipped=11, lr=[0.0005898419009394223, 0.0005898419009394223], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13320 loss: 2.4441 iter time (s): 63.358 samples/sec: 16.162 %comms: 0.002841213750227628 %optimizer_step 0.05527826457166444 %forward: 22.971293850328664 %backward: 61.60202054246273 [2025-04-03 15:14:35,196] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26624.09 | forward: 145541.17 | backward_microstep: 390309.98 | backward: 390297.13 | backward_inner_microstep: 390280.30 | backward_inner: 390273.87 | backward_allreduce_microstep: 8.02 | backward_allreduce: 2.71 | reduce_tied_grads: 0.30 | comms: 18.00 | reduce_grads: 0.20 | step: 350.23 | _step_clipping: 0.14 | _step_step: 348.57 | _step_zero_grad: 0.48 | _step_check_overflow: 0.47 samples/sec: 16.162 | iteration 13320/ 143000 | elapsed time per iteration (ms): 63358.4 | learning rate: 5.898E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.438946E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 15:25:04,010] [INFO] [logging.py:60:log_dist] [Rank 0] step=13330, skipped=11, lr=[0.0005898248885052603, 0.0005898248885052603], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13330 loss: 2.4550 iter time (s): 62.881 samples/sec: 16.285 %comms: 0.00288816690382612 %optimizer_step 0.05915833941197683 %forward: 23.138919645098344 %backward: 62.069814576863834 [2025-04-03 15:25:04,011] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21875.26 | forward: 145499.65 | backward_microstep: 390310.73 | backward: 390300.69 | backward_inner_microstep: 390284.59 | backward_inner: 390278.32 | backward_allreduce_microstep: 7.64 | backward_allreduce: 2.65 | reduce_tied_grads: 0.37 | comms: 18.16 | reduce_grads: 0.28 | step: 371.99 | _step_clipping: 0.17 | _step_step: 370.12 | _step_zero_grad: 0.49 | _step_check_overflow: 0.56 samples/sec: 16.285 | iteration 13330/ 143000 | elapsed time per iteration (ms): 62881.5 | learning rate: 5.898E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.440988E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 15:35:28,206] [INFO] [logging.py:60:log_dist] [Rank 0] step=13340, skipped=11, lr=[0.0005898078620828497, 0.0005898078620828497], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13340 loss: 2.4210 iter time (s): 62.419 samples/sec: 16.405 %comms: 0.0028887602351407057 %optimizer_step 0.05610460603822316 %forward: 23.314653347157982 %backward: 62.549821445576704 [2025-04-03 15:35:28,207] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17099.04 | forward: 145527.78 | backward_microstep: 390439.36 | backward: 390429.86 | backward_inner_microstep: 390407.75 | backward_inner: 390401.61 | backward_allreduce_microstep: 11.88 | backward_allreduce: 2.71 | reduce_tied_grads: 0.30 | comms: 18.03 | reduce_grads: 0.21 | step: 350.20 | _step_clipping: 0.11 | _step_step: 348.42 | _step_zero_grad: 0.51 | _step_check_overflow: 0.54 samples/sec: 16.405 | iteration 13340/ 143000 | elapsed time per iteration (ms): 62419.6 | learning rate: 5.898E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.437195E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 15:45:54,175] [INFO] [logging.py:60:log_dist] [Rank 0] step=13350, skipped=11, lr=[0.000589790821673012, 0.000589790821673012], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13350 loss: 2.4215 iter time (s): 62.596 samples/sec: 16.359 %comms: 0.0030067294090015736 %optimizer_step 0.05871537960009446 %forward: 23.208784375262578 %backward: 62.334756035754104 [2025-04-03 15:45:54,176] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19373.75 | forward: 145278.31 | backward_microstep: 390203.10 | backward: 390192.25 | backward_inner_microstep: 390176.06 | backward_inner: 390169.78 | backward_allreduce_microstep: 7.61 | backward_allreduce: 2.63 | reduce_tied_grads: 0.31 | comms: 18.82 | reduce_grads: 0.20 | step: 367.54 | _step_clipping: 0.13 | _step_step: 365.50 | _step_zero_grad: 0.58 | _step_check_overflow: 0.67 samples/sec: 16.359 | iteration 13350/ 143000 | elapsed time per iteration (ms): 62596.9 | learning rate: 5.898E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.443418E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 15:56:20,628] [INFO] [logging.py:60:log_dist] [Rank 0] step=13360, skipped=11, lr=[0.0005897737672765699, 0.0005897737672765699], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13360 loss: 2.4661 iter time (s): 62.645 samples/sec: 16.346 %comms: 0.0028861157038156757 %optimizer_step 0.05637841852137787 %forward: 23.198459036401648 %backward: 62.262196266716906 [2025-04-03 15:56:20,629] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19998.84 | forward: 145326.14 | backward_microstep: 390049.98 | backward: 390039.90 | backward_inner_microstep: 390023.75 | backward_inner: 390017.24 | backward_allreduce_microstep: 7.64 | backward_allreduce: 2.64 | reduce_tied_grads: 0.31 | comms: 18.08 | reduce_grads: 0.19 | step: 353.18 | _step_clipping: 0.11 | _step_step: 351.45 | _step_zero_grad: 0.47 | _step_check_overflow: 0.57 samples/sec: 16.346 | iteration 13360/ 143000 | elapsed time per iteration (ms): 62645.3 | learning rate: 5.898E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.442810E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 16:06:46,969] [INFO] [logging.py:60:log_dist] [Rank 0] step=13370, skipped=11, lr=[0.0005897566988943463, 0.0005897566988943463], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13370 loss: 2.4520 iter time (s): 62.634 samples/sec: 16.349 %comms: 0.0028804262064223035 %optimizer_step 0.059425967622904016 %forward: 23.22984836848716 %backward: 62.297240035067524 [2025-04-03 16:06:46,970] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19510.79 | forward: 145496.80 | backward_microstep: 390202.83 | backward: 390189.77 | backward_inner_microstep: 390173.19 | backward_inner: 390164.77 | backward_allreduce_microstep: 7.62 | backward_allreduce: 2.61 | reduce_tied_grads: 0.30 | comms: 18.04 | reduce_grads: 0.19 | step: 372.21 | _step_clipping: 0.12 | _step_step: 370.43 | _step_zero_grad: 0.47 | _step_check_overflow: 0.62 samples/sec: 16.349 | iteration 13370/ 143000 | elapsed time per iteration (ms): 62634.1 | learning rate: 5.898E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.449297E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 16:17:12,455] [INFO] [logging.py:60:log_dist] [Rank 0] step=13380, skipped=11, lr=[0.0005897396165271652, 0.0005897396165271652], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13380 loss: 2.4312 iter time (s): 62.548 samples/sec: 16.371 %comms: 0.0028603876745345066 %optimizer_step 0.05527823730507245 %forward: 23.24471560005961 %backward: 62.35113072686455 [2025-04-03 16:17:12,456] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19016.73 | forward: 145391.18 | backward_microstep: 390003.27 | backward: 389994.20 | backward_inner_microstep: 389978.41 | backward_inner: 389972.30 | backward_allreduce_microstep: 7.55 | backward_allreduce: 2.61 | reduce_tied_grads: 0.28 | comms: 17.89 | reduce_grads: 0.18 | step: 345.75 | _step_clipping: 0.12 | _step_step: 344.06 | _step_zero_grad: 0.47 | _step_check_overflow: 0.55 samples/sec: 16.371 | iteration 13380/ 143000 | elapsed time per iteration (ms): 62548.6 | learning rate: 5.897E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.441972E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 16:27:44,867] [INFO] [logging.py:60:log_dist] [Rank 0] step=13390, skipped=11, lr=[0.000589722520175851, 0.000589722520175851], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13390 loss: 2.4439 iter time (s): 63.241 samples/sec: 16.192 %comms: 0.002844104738435214 %optimizer_step 0.055608015770355686 %forward: 22.95936567956147 %backward: 61.66604432962569 [2025-04-03 16:27:44,868] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26153.37 | forward: 145196.48 | backward_microstep: 389988.13 | backward: 389979.95 | backward_inner_microstep: 389964.24 | backward_inner: 389958.19 | backward_allreduce_microstep: 7.55 | backward_allreduce: 2.60 | reduce_tied_grads: 0.30 | comms: 17.99 | reduce_grads: 0.19 | step: 351.67 | _step_clipping: 0.14 | _step_step: 349.98 | _step_zero_grad: 0.50 | _step_check_overflow: 0.48 samples/sec: 16.192 | iteration 13390/ 143000 | elapsed time per iteration (ms): 63241.2 | learning rate: 5.897E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.448113E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 16:38:22,100] [INFO] [logging.py:60:log_dist] [Rank 0] step=13400, skipped=11, lr=[0.0005897054098412287, 0.0005897054098412287], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13400 loss: 2.4421 iter time (s): 63.723 samples/sec: 16.070 %comms: 0.0028201561135159027 %optimizer_step 0.056423251543804905 %forward: 22.79153150090246 %backward: 61.23261348033201 [2025-04-03 16:38:22,100] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30672.38 | forward: 145233.83 | backward_microstep: 390201.60 | backward: 390190.84 | backward_inner_microstep: 390174.46 | backward_inner: 390168.18 | backward_allreduce_microstep: 7.60 | backward_allreduce: 2.62 | reduce_tied_grads: 0.31 | comms: 17.97 | reduce_grads: 0.19 | step: 359.54 | _step_clipping: 0.14 | _step_step: 357.88 | _step_zero_grad: 0.47 | _step_check_overflow: 0.49 samples/sec: 16.069 | iteration 13400/ 143000 | elapsed time per iteration (ms): 63723.3 | learning rate: 5.897E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.443273E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 16:48:43,662] [INFO] [logging.py:60:log_dist] [Rank 0] step=13410, skipped=11, lr=[0.0005896882855241243, 0.0005896882855241243], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13410 loss: 2.4721 iter time (s): 62.156 samples/sec: 16.475 %comms: 0.0028945924249021036 %optimizer_step 0.055737647568004416 %forward: 23.388516958217824 %backward: 62.78142300570207 [2025-04-03 16:48:43,663] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14823.70 | forward: 145372.97 | backward_microstep: 390231.06 | backward: 390222.35 | backward_inner_microstep: 390206.59 | backward_inner: 390200.45 | backward_allreduce_microstep: 7.51 | backward_allreduce: 2.57 | reduce_tied_grads: 0.27 | comms: 17.99 | reduce_grads: 0.19 | step: 346.44 | _step_clipping: 0.11 | _step_step: 344.76 | _step_zero_grad: 0.48 | _step_check_overflow: 0.54 samples/sec: 16.475 | iteration 13410/ 143000 | elapsed time per iteration (ms): 62156.3 | learning rate: 5.897E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.445309E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 16:59:09,345] [INFO] [logging.py:60:log_dist] [Rank 0] step=13420, skipped=11, lr=[0.0005896711472253642, 0.0005896711472253642], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13420 loss: 2.4622 iter time (s): 62.568 samples/sec: 16.366 %comms: 0.0028768239591569007 %optimizer_step 0.05656550918071038 %forward: 23.22389330546447 %backward: 62.36200293658353 [2025-04-03 16:59:09,346] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19059.09 | forward: 145306.75 | backward_microstep: 390198.21 | backward: 390185.21 | backward_inner_microstep: 390169.25 | backward_inner: 390163.18 | backward_allreduce_microstep: 7.57 | backward_allreduce: 2.61 | reduce_tied_grads: 0.27 | comms: 18.00 | reduce_grads: 0.19 | step: 353.92 | _step_clipping: 0.11 | _step_step: 352.08 | _step_zero_grad: 0.52 | _step_check_overflow: 0.61 samples/sec: 16.366 | iteration 13420/ 143000 | elapsed time per iteration (ms): 62568.3 | learning rate: 5.897E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.452056E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 17:09:32,424] [INFO] [logging.py:60:log_dist] [Rank 0] step=13430, skipped=11, lr=[0.0005896539949457756, 0.0005896539949457756], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13430 loss: 2.4385 iter time (s): 62.307 samples/sec: 16.435 %comms: 0.003150047767140363 %optimizer_step 0.057728656546037656 %forward: 23.330400870314094 %backward: 62.66682405706625 [2025-04-03 17:09:32,424] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15998.90 | forward: 145365.41 | backward_microstep: 390476.43 | backward: 390460.00 | backward_inner_microstep: 390441.53 | backward_inner: 390435.06 | backward_allreduce_microstep: 7.60 | backward_allreduce: 2.64 | reduce_tied_grads: 1.91 | comms: 19.63 | reduce_grads: 0.19 | step: 359.69 | _step_clipping: 0.13 | _step_step: 357.85 | _step_zero_grad: 0.50 | _step_check_overflow: 0.64 samples/sec: 16.435 | iteration 13430/ 143000 | elapsed time per iteration (ms): 62307.9 | learning rate: 5.897E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.445470E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 17:20:03,450] [INFO] [logging.py:60:log_dist] [Rank 0] step=13440, skipped=11, lr=[0.0005896368286861864, 0.0005896368286861864], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13440 loss: 2.4278 iter time (s): 63.102 samples/sec: 16.228 %comms: 0.002851028025285845 %optimizer_step 0.05573333219601529 %forward: 23.048961399711747 %backward: 61.83039750317501 [2025-04-03 17:20:03,451] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24294.08 | forward: 145443.82 | backward_microstep: 390171.01 | backward: 390162.87 | backward_inner_microstep: 390146.13 | backward_inner: 390139.67 | backward_allreduce_microstep: 8.12 | backward_allreduce: 2.79 | reduce_tied_grads: 0.31 | comms: 17.99 | reduce_grads: 0.20 | step: 351.69 | _step_clipping: 0.14 | _step_step: 349.95 | _step_zero_grad: 0.48 | _step_check_overflow: 0.53 samples/sec: 16.228 | iteration 13440/ 143000 | elapsed time per iteration (ms): 63102.7 | learning rate: 5.896E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.444483E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 17:30:24,234] [INFO] [logging.py:60:log_dist] [Rank 0] step=13450, skipped=11, lr=[0.0005896196484474251, 0.0005896196484474251], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13450 loss: 2.4455 iter time (s): 62.078 samples/sec: 16.495 %comms: 0.0028945730801626427 %optimizer_step 0.05605694281191117 %forward: 23.388572573748505 %backward: 62.82566466609869 [2025-04-03 17:30:24,235] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14467.36 | forward: 145191.28 | backward_microstep: 390016.05 | backward: 390008.34 | backward_inner_microstep: 389991.62 | backward_inner: 389985.50 | backward_allreduce_microstep: 8.25 | backward_allreduce: 2.95 | reduce_tied_grads: 0.28 | comms: 17.97 | reduce_grads: 0.20 | step: 347.99 | _step_clipping: 0.12 | _step_step: 346.33 | _step_zero_grad: 0.47 | _step_check_overflow: 0.51 samples/sec: 16.495 | iteration 13450/ 143000 | elapsed time per iteration (ms): 62078.4 | learning rate: 5.896E-04 | approx flops per GPU: 71.2TFLOPS | lm_loss: 2.434470E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 17:40:52,523] [INFO] [logging.py:60:log_dist] [Rank 0] step=13460, skipped=11, lr=[0.0005896024542303208, 0.0005896024542303208], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13460 loss: 2.4487 iter time (s): 62.828 samples/sec: 16.298 %comms: 0.002849522194694276 %optimizer_step 0.05499375575125552 %forward: 23.152130659477297 %backward: 62.08832383715354 [2025-04-03 17:40:52,524] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21584.56 | forward: 145461.11 | backward_microstep: 390098.71 | backward: 390090.95 | backward_inner_microstep: 390075.17 | backward_inner: 390069.21 | backward_allreduce_microstep: 7.59 | backward_allreduce: 2.62 | reduce_tied_grads: 0.27 | comms: 17.90 | reduce_grads: 0.19 | step: 345.52 | _step_clipping: 0.11 | _step_step: 343.90 | _step_zero_grad: 0.47 | _step_check_overflow: 0.49 samples/sec: 16.298 | iteration 13460/ 143000 | elapsed time per iteration (ms): 62828.9 | learning rate: 5.896E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.435668E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 17:51:15,769] [INFO] [logging.py:60:log_dist] [Rank 0] step=13470, skipped=11, lr=[0.0005895852460357036, 0.0005895852460357036], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13470 loss: 2.4527 iter time (s): 62.324 samples/sec: 16.430 %comms: 0.002872504604058061 %optimizer_step 0.05620356379441839 %forward: 23.327911412831746 %backward: 62.57852123065134 [2025-04-03 17:51:15,771] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16718.50 | forward: 145389.00 | backward_microstep: 390021.88 | backward: 390014.70 | backward_inner_microstep: 389998.68 | backward_inner: 389992.73 | backward_allreduce_microstep: 7.67 | backward_allreduce: 2.65 | reduce_tied_grads: 0.27 | comms: 17.90 | reduce_grads: 0.20 | step: 350.28 | _step_clipping: 0.11 | _step_step: 348.61 | _step_zero_grad: 0.49 | _step_check_overflow: 0.51 samples/sec: 16.430 | iteration 13470/ 143000 | elapsed time per iteration (ms): 62324.8 | learning rate: 5.896E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.437865E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 18:01:42,823] [INFO] [logging.py:60:log_dist] [Rank 0] step=13480, skipped=11, lr=[0.0005895680238644038, 0.0005895680238644038], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13480 loss: 2.4465 iter time (s): 62.704 samples/sec: 16.331 %comms: 0.0028673605190912646 %optimizer_step 0.0586543514193965 %forward: 23.208070644201257 %backward: 62.20737513094937 [2025-04-03 18:01:42,823] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20299.93 | forward: 145524.89 | backward_microstep: 390075.67 | backward: 390067.83 | backward_inner_microstep: 390051.41 | backward_inner: 390045.20 | backward_allreduce_microstep: 7.96 | backward_allreduce: 2.75 | reduce_tied_grads: 0.30 | comms: 17.98 | reduce_grads: 0.21 | step: 367.79 | _step_clipping: 0.11 | _step_step: 366.05 | _step_zero_grad: 0.53 | _step_check_overflow: 0.51 samples/sec: 16.330 | iteration 13480/ 143000 | elapsed time per iteration (ms): 62705.1 | learning rate: 5.896E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.447232E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 18:12:04,210] [INFO] [logging.py:60:log_dist] [Rank 0] step=13490, skipped=11, lr=[0.0005895507877172526, 0.0005895507877172526], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13490 loss: 2.4454 iter time (s): 62.138 samples/sec: 16.479 %comms: 0.0028746861835448442 %optimizer_step 0.0562909305996793 %forward: 23.366131117923043 %backward: 62.77580425360334 [2025-04-03 18:12:04,211] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14986.81 | forward: 145193.05 | backward_microstep: 390084.52 | backward: 390077.86 | backward_inner_microstep: 390062.14 | backward_inner: 390056.12 | backward_allreduce_microstep: 7.72 | backward_allreduce: 2.62 | reduce_tied_grads: 0.26 | comms: 17.86 | reduce_grads: 0.19 | step: 349.78 | _step_clipping: 0.11 | _step_step: 348.17 | _step_zero_grad: 0.47 | _step_check_overflow: 0.48 samples/sec: 16.479 | iteration 13490/ 143000 | elapsed time per iteration (ms): 62138.8 | learning rate: 5.896E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.438233E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 18:22:25,727] [INFO] [logging.py:60:log_dist] [Rank 0] step=13500, skipped=11, lr=[0.0005895335375950821, 0.0005895335375950821], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13500 loss: 2.4433 iter time (s): 62.151 samples/sec: 16.476 %comms: 0.0028737098913364906 %optimizer_step 0.05553316238792515 %forward: 23.370750969264364 %backward: 62.76240784501349 [2025-04-03 18:22:25,728] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15063.74 | forward: 145251.70 | backward_microstep: 390082.22 | backward: 390075.04 | backward_inner_microstep: 390059.21 | backward_inner: 390053.30 | backward_allreduce_microstep: 7.53 | backward_allreduce: 2.61 | reduce_tied_grads: 0.28 | comms: 17.86 | reduce_grads: 0.19 | step: 345.14 | _step_clipping: 0.10 | _step_step: 343.37 | _step_zero_grad: 0.46 | _step_check_overflow: 0.68 samples/sec: 16.476 | iteration 13500/ 143000 | elapsed time per iteration (ms): 62151.7 | learning rate: 5.895E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.440063E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 18:32:47,366] [INFO] [logging.py:60:log_dist] [Rank 0] step=13510, skipped=11, lr=[0.0005895162734987248, 0.0005895162734987248], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13510 loss: 2.4389 iter time (s): 62.163 samples/sec: 16.473 %comms: 0.002867117263795138 %optimizer_step 0.055428543064062565 %forward: 23.36592156102735 %backward: 62.74965460489758 [2025-04-03 18:32:47,367] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15162.86 | forward: 145250.55 | backward_microstep: 390079.78 | backward: 390073.30 | backward_inner_microstep: 390057.69 | backward_inner: 390051.83 | backward_allreduce_microstep: 7.58 | backward_allreduce: 2.60 | reduce_tied_grads: 0.25 | comms: 17.82 | reduce_grads: 0.18 | step: 344.56 | _step_clipping: 0.11 | _step_step: 343.00 | _step_zero_grad: 0.46 | _step_check_overflow: 0.46 samples/sec: 16.473 | iteration 13510/ 143000 | elapsed time per iteration (ms): 62163.9 | learning rate: 5.895E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.439135E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 18:43:13,859] [INFO] [logging.py:60:log_dist] [Rank 0] step=13520, skipped=11, lr=[0.0005894989954290138, 0.0005894989954290138], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13520 loss: 2.4451 iter time (s): 62.649 samples/sec: 16.345 %comms: 0.0028520981064085564 %optimizer_step 0.05529231958918023 %forward: 23.204274102830915 %backward: 62.26853892273402 [2025-04-03 18:43:13,860] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19844.37 | forward: 145371.91 | backward_microstep: 390111.66 | backward: 390104.70 | backward_inner_microstep: 390089.03 | backward_inner: 390083.18 | backward_allreduce_microstep: 7.60 | backward_allreduce: 2.62 | reduce_tied_grads: 0.26 | comms: 17.87 | reduce_grads: 0.19 | step: 346.40 | _step_clipping: 0.10 | _step_step: 344.76 | _step_zero_grad: 0.47 | _step_check_overflow: 0.51 samples/sec: 16.345 | iteration 13520/ 143000 | elapsed time per iteration (ms): 62649.3 | learning rate: 5.895E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.434742E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 18:53:39,905] [INFO] [logging.py:60:log_dist] [Rank 0] step=13530, skipped=11, lr=[0.0005894817033867832, 0.0005894817033867832], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13530 loss: 2.4649 iter time (s): 62.604 samples/sec: 16.357 %comms: 0.002902959034898873 %optimizer_step 0.05835387144669899 %forward: 23.237307267480368 %backward: 62.345651750143794 [2025-04-03 18:53:39,906] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19041.82 | forward: 145474.92 | backward_microstep: 390320.26 | backward: 390308.93 | backward_inner_microstep: 390292.45 | backward_inner: 390286.03 | backward_allreduce_microstep: 7.53 | backward_allreduce: 2.60 | reduce_tied_grads: 0.31 | comms: 18.17 | reduce_grads: 0.19 | step: 365.32 | _step_clipping: 0.12 | _step_step: 363.52 | _step_zero_grad: 0.51 | _step_check_overflow: 0.59 samples/sec: 16.357 | iteration 13530/ 143000 | elapsed time per iteration (ms): 62604.6 | learning rate: 5.895E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.442541E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 19:04:06,295] [INFO] [logging.py:60:log_dist] [Rank 0] step=13540, skipped=11, lr=[0.0005894643973728675, 0.0005894643973728675], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13540 loss: 2.4381 iter time (s): 62.639 samples/sec: 16.348 %comms: 0.002894281884475659 %optimizer_step 0.05657635956928444 %forward: 23.217605592714214 %backward: 62.30639664984269 [2025-04-03 19:04:06,296] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19468.49 | forward: 145431.61 | backward_microstep: 390286.68 | backward: 390277.95 | backward_inner_microstep: 390262.07 | backward_inner: 390255.91 | backward_allreduce_microstep: 7.54 | backward_allreduce: 2.60 | reduce_tied_grads: 0.28 | comms: 18.13 | reduce_grads: 0.19 | step: 354.39 | _step_clipping: 0.13 | _step_step: 352.62 | _step_zero_grad: 0.49 | _step_check_overflow: 0.58 samples/sec: 16.348 | iteration 13540/ 143000 | elapsed time per iteration (ms): 62639.1 | learning rate: 5.895E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.441352E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 19:14:34,004] [INFO] [logging.py:60:log_dist] [Rank 0] step=13550, skipped=11, lr=[0.000589447077388102, 0.000589447077388102], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13550 loss: 2.4325 iter time (s): 62.770 samples/sec: 16.313 %comms: 0.0028550480107220055 %optimizer_step 0.056400026879467796 %forward: 23.18649280451461 %backward: 62.17015274756784 [2025-04-03 19:14:34,004] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20697.25 | forward: 145542.21 | backward_microstep: 390253.12 | backward: 390243.64 | backward_inner_microstep: 390225.80 | backward_inner: 390219.54 | backward_allreduce_microstep: 7.64 | backward_allreduce: 2.64 | reduce_tied_grads: 0.28 | comms: 17.92 | reduce_grads: 0.19 | step: 354.02 | _step_clipping: 0.12 | _step_step: 352.29 | _step_zero_grad: 0.51 | _step_check_overflow: 0.55 samples/sec: 16.313 | iteration 13550/ 143000 | elapsed time per iteration (ms): 62770.8 | learning rate: 5.894E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.436609E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 19:25:01,908] [INFO] [logging.py:60:log_dist] [Rank 0] step=13560, skipped=11, lr=[0.0005894297434333226, 0.0005894297434333226], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13560 loss: 2.4174 iter time (s): 62.790 samples/sec: 16.308 %comms: 0.0029082665977431177 %optimizer_step 0.05602001449995991 %forward: 23.158606618731785 %backward: 62.14468209555256 [2025-04-03 19:25:01,908] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21053.42 | forward: 145412.49 | backward_microstep: 390213.85 | backward: 390205.39 | backward_inner_microstep: 390189.22 | backward_inner: 390182.80 | backward_allreduce_microstep: 7.78 | backward_allreduce: 2.66 | reduce_tied_grads: 0.29 | comms: 18.26 | reduce_grads: 0.38 | step: 351.75 | _step_clipping: 0.12 | _step_step: 350.03 | _step_zero_grad: 0.47 | _step_check_overflow: 0.57 samples/sec: 16.308 | iteration 13560/ 143000 | elapsed time per iteration (ms): 62790.4 | learning rate: 5.894E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.436247E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 19:35:38,876] [INFO] [logging.py:60:log_dist] [Rank 0] step=13570, skipped=11, lr=[0.0005894123955093659, 0.0005894123955093659], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13570 loss: 2.4373 iter time (s): 63.696 samples/sec: 16.076 %comms: 0.002830837021665234 %optimizer_step 0.05533519237703295 %forward: 22.855654571460555 %backward: 61.267155303645104 [2025-04-03 19:35:38,876] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29916.93 | forward: 145581.86 | backward_microstep: 390257.83 | backward: 390248.55 | backward_inner_microstep: 390231.85 | backward_inner: 390225.27 | backward_allreduce_microstep: 7.86 | backward_allreduce: 2.72 | reduce_tied_grads: 0.33 | comms: 18.03 | reduce_grads: 0.21 | step: 352.46 | _step_clipping: 0.13 | _step_step: 350.72 | _step_zero_grad: 0.49 | _step_check_overflow: 0.55 samples/sec: 16.076 | iteration 13570/ 143000 | elapsed time per iteration (ms): 63696.8 | learning rate: 5.894E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.438238E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 19:46:00,575] [INFO] [logging.py:60:log_dist] [Rank 0] step=13580, skipped=11, lr=[0.0005893950336170692, 0.0005893950336170692], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13580 loss: 2.4340 iter time (s): 62.169 samples/sec: 16.471 %comms: 0.002882411564440905 %optimizer_step 0.05628815457173216 %forward: 23.367194517611402 %backward: 62.74634964306295 [2025-04-03 19:46:00,576] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15129.76 | forward: 145272.44 | backward_microstep: 390097.28 | backward: 390090.27 | backward_inner_microstep: 390074.21 | backward_inner: 390068.24 | backward_allreduce_microstep: 7.76 | backward_allreduce: 2.67 | reduce_tied_grads: 0.29 | comms: 17.92 | reduce_grads: 0.20 | step: 349.94 | _step_clipping: 0.13 | _step_step: 348.24 | _step_zero_grad: 0.50 | _step_check_overflow: 0.50 samples/sec: 16.471 | iteration 13580/ 143000 | elapsed time per iteration (ms): 62169.9 | learning rate: 5.894E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.434076E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 19:56:22,903] [INFO] [logging.py:60:log_dist] [Rank 0] step=13590, skipped=11, lr=[0.0005893776577572707, 0.0005893776577572707], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13590 loss: 2.4366 iter time (s): 62.232 samples/sec: 16.454 %comms: 0.0028786583756319896 %optimizer_step 0.05998339596561042 %forward: 23.33556320585823 %backward: 62.68003366780208 [2025-04-03 19:56:22,904] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15830.11 | forward: 145222.42 | backward_microstep: 390078.69 | backward: 390071.84 | backward_inner_microstep: 390055.85 | backward_inner: 390049.96 | backward_allreduce_microstep: 7.78 | backward_allreduce: 2.67 | reduce_tied_grads: 0.28 | comms: 17.91 | reduce_grads: 0.20 | step: 373.29 | _step_clipping: 0.11 | _step_step: 371.58 | _step_zero_grad: 0.54 | _step_check_overflow: 0.50 samples/sec: 16.454 | iteration 13590/ 143000 | elapsed time per iteration (ms): 62232.8 | learning rate: 5.894E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.445162E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 20:06:45,441] [INFO] [logging.py:60:log_dist] [Rank 0] step=13600, skipped=11, lr=[0.0005893602679308087, 0.0005893602679308087], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13600 loss: 2.4201 iter time (s): 62.253 samples/sec: 16.449 %comms: 0.0028692607324388632 %optimizer_step 0.056364032196131744 %forward: 23.339179913089918 %backward: 62.66279287552668 [2025-04-03 20:06:45,442] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15938.41 | forward: 145293.99 | backward_microstep: 390103.33 | backward: 390096.27 | backward_inner_microstep: 390080.29 | backward_inner: 390074.43 | backward_allreduce_microstep: 7.78 | backward_allreduce: 2.69 | reduce_tied_grads: 0.24 | comms: 17.86 | reduce_grads: 0.18 | step: 350.88 | _step_clipping: 0.13 | _step_step: 349.20 | _step_zero_grad: 0.50 | _step_check_overflow: 0.51 samples/sec: 16.449 | iteration 13600/ 143000 | elapsed time per iteration (ms): 62253.8 | learning rate: 5.894E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.439516E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 20:17:08,281] [INFO] [logging.py:60:log_dist] [Rank 0] step=13610, skipped=11, lr=[0.0005893428641385226, 0.0005893428641385226], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13610 loss: 2.4425 iter time (s): 62.283 samples/sec: 16.441 %comms: 0.002876026695393707 %optimizer_step 0.05706179057723476 %forward: 23.320088861385216 %backward: 62.63530155371685 [2025-04-03 20:17:08,281] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16241.06 | forward: 145245.38 | backward_microstep: 390120.71 | backward: 390113.79 | backward_inner_microstep: 390097.62 | backward_inner: 390091.70 | backward_allreduce_microstep: 7.87 | backward_allreduce: 2.70 | reduce_tied_grads: 0.26 | comms: 17.91 | reduce_grads: 0.19 | step: 355.40 | _step_clipping: 0.11 | _step_step: 353.71 | _step_zero_grad: 0.50 | _step_check_overflow: 0.52 samples/sec: 16.441 | iteration 13610/ 143000 | elapsed time per iteration (ms): 62283.9 | learning rate: 5.893E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.432924E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 20:27:33,828] [INFO] [logging.py:60:log_dist] [Rank 0] step=13620, skipped=11, lr=[0.0005893254463812524, 0.0005893254463812524], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13620 loss: 2.4293 iter time (s): 62.554 samples/sec: 16.370 %comms: 0.002899060502431452 %optimizer_step 0.06255168109512432 %forward: 23.307576069649798 %backward: 62.44515195102818 [2025-04-03 20:27:33,829] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17765.66 | forward: 145798.61 | backward_microstep: 390634.44 | backward: 390620.47 | backward_inner_microstep: 390601.07 | backward_inner: 390593.69 | backward_allreduce_microstep: 9.41 | backward_allreduce: 3.42 | reduce_tied_grads: 0.36 | comms: 18.13 | reduce_grads: 0.25 | step: 391.29 | _step_clipping: 0.14 | _step_step: 389.51 | _step_zero_grad: 0.50 | _step_check_overflow: 0.50 samples/sec: 16.370 | iteration 13620/ 143000 | elapsed time per iteration (ms): 62554.7 | learning rate: 5.893E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.432118E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 20:37:54,635] [INFO] [logging.py:60:log_dist] [Rank 0] step=13630, skipped=11, lr=[0.000589308014659839, 0.000589308014659839], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13630 loss: 2.4588 iter time (s): 62.080 samples/sec: 16.495 %comms: 0.002875952338723155 %optimizer_step 0.05606999078604936 %forward: 23.400598807265276 %backward: 62.83141752642961 [2025-04-03 20:37:54,636] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14258.02 | forward: 145271.43 | backward_microstep: 390066.00 | backward: 390058.82 | backward_inner_microstep: 390043.31 | backward_inner: 390037.47 | backward_allreduce_microstep: 7.44 | backward_allreduce: 2.58 | reduce_tied_grads: 0.28 | comms: 17.85 | reduce_grads: 0.18 | step: 348.08 | _step_clipping: 0.09 | _step_step: 346.50 | _step_zero_grad: 0.48 | _step_check_overflow: 0.44 samples/sec: 16.495 | iteration 13630/ 143000 | elapsed time per iteration (ms): 62080.7 | learning rate: 5.893E-04 | approx flops per GPU: 71.2TFLOPS | lm_loss: 2.436339E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 20:48:14,707] [INFO] [logging.py:60:log_dist] [Rank 0] step=13640, skipped=11, lr=[0.0005892905689751233, 0.0005892905689751233], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13640 loss: 2.4385 iter time (s): 62.007 samples/sec: 16.514 %comms: 0.0028804400252048362 %optimizer_step 0.05639276595419046 %forward: 23.415301021532002 %backward: 62.89226318315625 [2025-04-03 20:48:14,708] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13693.07 | forward: 145190.50 | backward_microstep: 389980.43 | backward: 389974.02 | backward_inner_microstep: 389958.74 | backward_inner: 389953.04 | backward_allreduce_microstep: 7.42 | backward_allreduce: 2.55 | reduce_tied_grads: 0.24 | comms: 17.86 | reduce_grads: 0.19 | step: 349.67 | _step_clipping: 0.10 | _step_step: 348.11 | _step_zero_grad: 0.46 | _step_check_overflow: 0.47 samples/sec: 16.514 | iteration 13640/ 143000 | elapsed time per iteration (ms): 62007.2 | learning rate: 5.893E-04 | approx flops per GPU: 71.2TFLOPS | lm_loss: 2.452044E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 20:58:36,303] [INFO] [logging.py:60:log_dist] [Rank 0] step=13650, skipped=11, lr=[0.0005892731093279475, 0.0005892731093279475], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13650 loss: 2.4345 iter time (s): 62.159 samples/sec: 16.474 %comms: 0.0028687000046769273 %optimizer_step 0.05562365237772478 %forward: 23.384417954178385 %backward: 62.755183170091186 [2025-04-03 20:58:36,303] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14883.65 | forward: 145355.30 | backward_microstep: 390087.02 | backward: 390080.20 | backward_inner_microstep: 390064.77 | backward_inner: 390058.95 | backward_allreduce_microstep: 7.48 | backward_allreduce: 2.58 | reduce_tied_grads: 0.25 | comms: 17.83 | reduce_grads: 0.19 | step: 345.75 | _step_clipping: 0.10 | _step_step: 344.23 | _step_zero_grad: 0.46 | _step_check_overflow: 0.41 samples/sec: 16.474 | iteration 13650/ 143000 | elapsed time per iteration (ms): 62159.6 | learning rate: 5.893E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.436860E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 21:08:57,561] [INFO] [logging.py:60:log_dist] [Rank 0] step=13660, skipped=11, lr=[0.0005892556357191544, 0.0005892556357191544], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13660 loss: 2.4405 iter time (s): 62.125 samples/sec: 16.483 %comms: 0.0028773977109377664 %optimizer_step 0.05673393814920465 %forward: 23.39230626020822 %backward: 62.78710398294272 [2025-04-03 21:08:57,561] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14586.62 | forward: 145325.32 | backward_microstep: 390074.44 | backward: 390066.55 | backward_inner_microstep: 390050.79 | backward_inner: 390044.81 | backward_allreduce_microstep: 7.62 | backward_allreduce: 2.63 | reduce_tied_grads: 0.27 | comms: 17.88 | reduce_grads: 0.19 | step: 352.46 | _step_clipping: 0.10 | _step_step: 350.80 | _step_zero_grad: 0.49 | _step_check_overflow: 0.50 samples/sec: 16.483 | iteration 13660/ 143000 | elapsed time per iteration (ms): 62125.8 | learning rate: 5.893E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.426758E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 21:19:22,856] [INFO] [logging.py:60:log_dist] [Rank 0] step=13670, skipped=11, lr=[0.0005892381481495873, 0.0005892381481495873], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13670 loss: 2.4100 iter time (s): 62.529 samples/sec: 16.376 %comms: 0.002903202945599981 %optimizer_step 0.05807183728897 %forward: 23.25317979974291 %backward: 62.408805317037384 [2025-04-03 21:19:22,857] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18336.40 | forward: 145399.74 | backward_microstep: 390246.74 | backward: 390235.83 | backward_inner_microstep: 390219.56 | backward_inner: 390213.42 | backward_allreduce_microstep: 7.76 | backward_allreduce: 2.81 | reduce_tied_grads: 0.30 | comms: 18.15 | reduce_grads: 0.21 | step: 363.12 | _step_clipping: 0.12 | _step_step: 361.37 | _step_zero_grad: 0.47 | _step_check_overflow: 0.56 samples/sec: 16.376 | iteration 13670/ 143000 | elapsed time per iteration (ms): 62529.5 | learning rate: 5.892E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.431089E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 21:29:44,392] [INFO] [logging.py:60:log_dist] [Rank 0] step=13680, skipped=11, lr=[0.00058922064662009, 0.00058922064662009], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13680 loss: 2.4275 iter time (s): 62.153 samples/sec: 16.475 %comms: 0.002914662055719526 %optimizer_step 0.056291774880029455 %forward: 23.38990982179756 %backward: 62.79268263162175 [2025-04-03 21:29:44,393] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14541.86 | forward: 145375.48 | backward_microstep: 390289.93 | backward: 390275.82 | backward_inner_microstep: 390257.69 | backward_inner: 390251.33 | backward_allreduce_microstep: 7.64 | backward_allreduce: 2.64 | reduce_tied_grads: 0.33 | comms: 18.12 | reduce_grads: 0.20 | step: 349.87 | _step_clipping: 0.13 | _step_step: 348.14 | _step_zero_grad: 0.50 | _step_check_overflow: 0.51 samples/sec: 16.475 | iteration 13680/ 143000 | elapsed time per iteration (ms): 62153.6 | learning rate: 5.892E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.430623E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 21:40:06,241] [INFO] [logging.py:60:log_dist] [Rank 0] step=13690, skipped=11, lr=[0.0005892031311315076, 0.0005892031311315076], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13690 loss: 2.4150 iter time (s): 62.184 samples/sec: 16.467 %comms: 0.002901117467146482 %optimizer_step 0.05643692758200382 %forward: 23.375143590160505 %backward: 62.769540213070066 [2025-04-03 21:40:06,242] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14842.84 | forward: 145356.88 | backward_microstep: 390342.62 | backward: 390328.48 | backward_inner_microstep: 390311.71 | backward_inner: 390305.28 | backward_allreduce_microstep: 7.87 | backward_allreduce: 2.85 | reduce_tied_grads: 0.31 | comms: 18.04 | reduce_grads: 0.19 | step: 350.95 | _step_clipping: 0.12 | _step_step: 349.18 | _step_zero_grad: 0.48 | _step_check_overflow: 0.54 samples/sec: 16.467 | iteration 13690/ 143000 | elapsed time per iteration (ms): 62184.9 | learning rate: 5.892E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.428801E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 21:50:35,969] [INFO] [logging.py:60:log_dist] [Rank 0] step=13700, skipped=11, lr=[0.0005891856016846851, 0.0005891856016846851], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13700 loss: 2.4286 iter time (s): 62.972 samples/sec: 16.261 %comms: 0.0028538086760177897 %optimizer_step 0.05580075093400189 %forward: 23.081911094066623 %backward: 61.98973136743721 [2025-04-03 21:50:35,969] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22688.93 | forward: 145351.69 | backward_microstep: 390374.58 | backward: 390362.49 | backward_inner_microstep: 390342.85 | backward_inner: 390336.56 | backward_allreduce_microstep: 10.97 | backward_allreduce: 4.36 | reduce_tied_grads: 0.30 | comms: 17.97 | reduce_grads: 0.19 | step: 351.39 | _step_clipping: 0.11 | _step_step: 349.74 | _step_zero_grad: 0.49 | _step_check_overflow: 0.46 samples/sec: 16.261 | iteration 13700/ 143000 | elapsed time per iteration (ms): 62972.7 | learning rate: 5.892E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.436480E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 22:00:56,370] [INFO] [logging.py:60:log_dist] [Rank 0] step=13710, skipped=11, lr=[0.0005891680582804688, 0.0005891680582804688], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13710 loss: 2.4554 iter time (s): 62.040 samples/sec: 16.506 %comms: 0.0028850627992795096 %optimizer_step 0.0562424110090547 %forward: 23.39421493746754 %backward: 62.85869959016879 [2025-04-03 22:00:56,370] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14048.36 | forward: 145136.64 | backward_microstep: 389979.22 | backward: 389972.50 | backward_inner_microstep: 389956.77 | backward_inner: 389950.97 | backward_allreduce_microstep: 7.64 | backward_allreduce: 2.62 | reduce_tied_grads: 0.28 | comms: 17.90 | reduce_grads: 0.19 | step: 348.93 | _step_clipping: 0.12 | _step_step: 347.28 | _step_zero_grad: 0.48 | _step_check_overflow: 0.49 samples/sec: 16.505 | iteration 13710/ 143000 | elapsed time per iteration (ms): 62040.1 | learning rate: 5.892E-04 | approx flops per GPU: 71.2TFLOPS | lm_loss: 2.439574E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 22:11:17,357] [INFO] [logging.py:60:log_dist] [Rank 0] step=13720, skipped=11, lr=[0.0005891505009197054, 0.0005891505009197054], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13720 loss: 2.4387 iter time (s): 62.098 samples/sec: 16.490 %comms: 0.002913707500076034 %optimizer_step 0.05651202693624872 %forward: 23.38124079821157 %backward: 62.79820193800941 [2025-04-03 22:11:17,357] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14567.27 | forward: 145193.19 | backward_microstep: 389972.38 | backward: 389965.25 | backward_inner_microstep: 389949.43 | backward_inner: 389943.54 | backward_allreduce_microstep: 7.67 | backward_allreduce: 2.62 | reduce_tied_grads: 0.28 | comms: 18.09 | reduce_grads: 0.19 | step: 350.93 | _step_clipping: 0.11 | _step_step: 349.24 | _step_zero_grad: 0.49 | _step_check_overflow: 0.53 samples/sec: 16.490 | iteration 13720/ 143000 | elapsed time per iteration (ms): 62098.7 | learning rate: 5.892E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.434823E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 22:21:36,029] [INFO] [logging.py:60:log_dist] [Rank 0] step=13730, skipped=11, lr=[0.0005891329296032422, 0.0005891329296032422], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13730 loss: 2.4295 iter time (s): 61.867 samples/sec: 16.552 %comms: 0.0028865306886578608 %optimizer_step 0.05585499698566185 %forward: 23.45032071638523 %backward: 63.02776226655797 [2025-04-03 22:21:36,030] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 12419.71 | forward: 145079.52 | backward_microstep: 389938.48 | backward: 389932.31 | backward_inner_microstep: 389917.54 | backward_inner: 389911.96 | backward_allreduce_microstep: 7.15 | backward_allreduce: 2.47 | reduce_tied_grads: 0.25 | comms: 17.86 | reduce_grads: 0.18 | step: 345.56 | _step_clipping: 0.10 | _step_step: 343.84 | _step_zero_grad: 0.47 | _step_check_overflow: 0.60 samples/sec: 16.552 | iteration 13730/ 143000 | elapsed time per iteration (ms): 61867.3 | learning rate: 5.891E-04 | approx flops per GPU: 71.4TFLOPS | lm_loss: 2.434139E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 22:31:54,645] [INFO] [logging.py:60:log_dist] [Rank 0] step=13740, skipped=11, lr=[0.0005891153443319272, 0.0005891153443319272], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13740 loss: 2.4374 iter time (s): 61.861 samples/sec: 16.553 %comms: 0.0028816768178517127 %optimizer_step 0.055471690992418735 %forward: 23.457124711756634 %backward: 63.03857676502167 [2025-04-03 22:31:54,646] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 12279.60 | forward: 145107.94 | backward_microstep: 389969.37 | backward: 389962.45 | backward_inner_microstep: 389947.22 | backward_inner: 389941.52 | backward_allreduce_microstep: 7.37 | backward_allreduce: 2.54 | reduce_tied_grads: 0.26 | comms: 17.83 | reduce_grads: 0.18 | step: 343.15 | _step_clipping: 0.10 | _step_step: 341.42 | _step_zero_grad: 0.45 | _step_check_overflow: 0.66 samples/sec: 16.553 | iteration 13740/ 143000 | elapsed time per iteration (ms): 61861.5 | learning rate: 5.891E-04 | approx flops per GPU: 71.4TFLOPS | lm_loss: 2.425966E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 22:42:21,261] [INFO] [logging.py:60:log_dist] [Rank 0] step=13750, skipped=11, lr=[0.0005890977451066093, 0.0005890977451066093], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13750 loss: 2.4384 iter time (s): 62.661 samples/sec: 16.342 %comms: 0.0028896293199755247 %optimizer_step 0.055656935545591125 %forward: 23.19724255768549 %backward: 62.27469540311562 [2025-04-03 22:42:21,261] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19685.25 | forward: 145356.18 | backward_microstep: 390230.29 | backward: 390219.31 | backward_inner_microstep: 390202.92 | backward_inner: 390196.71 | backward_allreduce_microstep: 7.82 | backward_allreduce: 2.69 | reduce_tied_grads: 0.31 | comms: 18.11 | reduce_grads: 0.21 | step: 348.75 | _step_clipping: 0.13 | _step_step: 347.01 | _step_zero_grad: 0.50 | _step_check_overflow: 0.53 samples/sec: 16.342 | iteration 13750/ 143000 | elapsed time per iteration (ms): 62661.5 | learning rate: 5.891E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.431454E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 22:52:47,421] [INFO] [logging.py:60:log_dist] [Rank 0] step=13760, skipped=11, lr=[0.000589080131928138, 0.000589080131928138], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13760 loss: 2.4275 iter time (s): 62.615 samples/sec: 16.354 %comms: 0.002925387971226627 %optimizer_step 0.05658949782276204 %forward: 23.20113392422667 %backward: 62.330660453836984 [2025-04-03 22:52:47,422] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19293.49 | forward: 145275.04 | backward_microstep: 390298.86 | backward: 390286.50 | backward_inner_microstep: 390270.39 | backward_inner: 390263.88 | backward_allreduce_microstep: 7.61 | backward_allreduce: 2.65 | reduce_tied_grads: 0.42 | comms: 18.32 | reduce_grads: 0.21 | step: 354.34 | _step_clipping: 0.12 | _step_step: 352.60 | _step_zero_grad: 0.49 | _step_check_overflow: 0.53 samples/sec: 16.354 | iteration 13760/ 143000 | elapsed time per iteration (ms): 62616.1 | learning rate: 5.891E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.440015E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 23:03:13,763] [INFO] [logging.py:60:log_dist] [Rank 0] step=13770, skipped=11, lr=[0.0005890625047973631, 0.0005890625047973631], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13770 loss: 2.4321 iter time (s): 62.634 samples/sec: 16.349 %comms: 0.0028850652778641306 %optimizer_step 0.05671913785753263 %forward: 23.207093308768883 %backward: 62.297655153831286 [2025-04-03 23:03:13,764] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19495.96 | forward: 145354.53 | backward_microstep: 390201.42 | backward: 390193.04 | backward_inner_microstep: 390177.20 | backward_inner: 390170.90 | backward_allreduce_microstep: 7.54 | backward_allreduce: 2.58 | reduce_tied_grads: 0.30 | comms: 18.07 | reduce_grads: 0.19 | step: 355.25 | _step_clipping: 0.13 | _step_step: 353.35 | _step_zero_grad: 0.48 | _step_check_overflow: 0.72 samples/sec: 16.349 | iteration 13770/ 143000 | elapsed time per iteration (ms): 62634.2 | learning rate: 5.891E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.425094E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 23:13:38,093] [INFO] [logging.py:60:log_dist] [Rank 0] step=13780, skipped=11, lr=[0.0005890448637151355, 0.0005890448637151355], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13780 loss: 2.4323 iter time (s): 62.432 samples/sec: 16.402 %comms: 0.0028825294268201743 %optimizer_step 0.057354591006034884 %forward: 23.308721617946375 %backward: 62.49867827899309 [2025-04-03 23:13:38,094] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17296.88 | forward: 145521.85 | backward_microstep: 390202.87 | backward: 390193.99 | backward_inner_microstep: 390173.92 | backward_inner: 390167.70 | backward_allreduce_microstep: 7.67 | backward_allreduce: 2.62 | reduce_tied_grads: 0.31 | comms: 18.00 | reduce_grads: 0.20 | step: 358.08 | _step_clipping: 0.10 | _step_step: 356.37 | _step_zero_grad: 0.50 | _step_check_overflow: 0.52 samples/sec: 16.402 | iteration 13780/ 143000 | elapsed time per iteration (ms): 62433.0 | learning rate: 5.890E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.429430E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 23:24:02,567] [INFO] [logging.py:60:log_dist] [Rank 0] step=13790, skipped=11, lr=[0.0005890272086823068, 0.0005890272086823068], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13790 loss: 2.4425 iter time (s): 62.447 samples/sec: 16.398 %comms: 0.0028838496486739687 %optimizer_step 0.05717707072257598 %forward: 23.293507894271347 %backward: 62.49374302098477 [2025-04-03 23:24:02,567] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17427.77 | forward: 145460.43 | backward_microstep: 390261.64 | backward: 390253.23 | backward_inner_microstep: 390237.06 | backward_inner: 390230.85 | backward_allreduce_microstep: 7.74 | backward_allreduce: 2.67 | reduce_tied_grads: 0.31 | comms: 18.01 | reduce_grads: 0.19 | step: 357.05 | _step_clipping: 0.11 | _step_step: 355.31 | _step_zero_grad: 0.51 | _step_check_overflow: 0.54 samples/sec: 16.398 | iteration 13790/ 143000 | elapsed time per iteration (ms): 62447.3 | learning rate: 5.890E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.431441E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 23:34:29,817] [INFO] [logging.py:60:log_dist] [Rank 0] step=13800, skipped=11, lr=[0.0005890095396997288, 0.0005890095396997288], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13800 loss: 2.4488 iter time (s): 62.725 samples/sec: 16.325 %comms: 0.0028783386229316397 %optimizer_step 0.05601104421171343 %forward: 23.18206488271064 %backward: 62.20229619673652 [2025-04-03 23:34:29,818] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20376.31 | forward: 145408.44 | backward_microstep: 390169.78 | backward: 390161.05 | backward_inner_microstep: 390144.83 | backward_inner: 390138.74 | backward_allreduce_microstep: 7.44 | backward_allreduce: 2.55 | reduce_tied_grads: 0.27 | comms: 18.05 | reduce_grads: 0.18 | step: 351.33 | _step_clipping: 0.11 | _step_step: 349.59 | _step_zero_grad: 0.47 | _step_check_overflow: 0.61 samples/sec: 16.325 | iteration 13800/ 143000 | elapsed time per iteration (ms): 62725.1 | learning rate: 5.890E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.442491E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 23:44:54,621] [INFO] [logging.py:60:log_dist] [Rank 0] step=13810, skipped=11, lr=[0.0005889918567682546, 0.0005889918567682546], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13810 loss: 2.4505 iter time (s): 62.480 samples/sec: 16.389 %comms: 0.0028578232574535734 %optimizer_step 0.055254951092267426 %forward: 23.239463810340474 %backward: 62.418690674522004 [2025-04-03 23:44:54,622] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18313.55 | forward: 145199.88 | backward_microstep: 389998.89 | backward: 389991.20 | backward_inner_microstep: 389974.13 | backward_inner: 389968.26 | backward_allreduce_microstep: 7.38 | backward_allreduce: 2.54 | reduce_tied_grads: 0.25 | comms: 17.86 | reduce_grads: 0.18 | step: 345.23 | _step_clipping: 0.12 | _step_step: 343.62 | _step_zero_grad: 0.45 | _step_check_overflow: 0.51 samples/sec: 16.389 | iteration 13810/ 143000 | elapsed time per iteration (ms): 62480.4 | learning rate: 5.890E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.435480E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-03 23:55:14,198] [INFO] [logging.py:60:log_dist] [Rank 0] step=13820, skipped=11, lr=[0.0005889741598887375, 0.0005889741598887375], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13820 loss: 2.4268 iter time (s): 61.957 samples/sec: 16.528 %comms: 0.002891017733044386 %optimizer_step 0.05578188854475845 %forward: 23.419142070585227 %backward: 62.93673006151983 [2025-04-03 23:55:14,198] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13233.70 | forward: 145098.24 | backward_microstep: 389945.58 | backward: 389937.80 | backward_inner_microstep: 389922.99 | backward_inner: 389917.47 | backward_allreduce_microstep: 7.18 | backward_allreduce: 2.47 | reduce_tied_grads: 0.25 | comms: 17.91 | reduce_grads: 0.18 | step: 345.61 | _step_clipping: 0.10 | _step_step: 343.84 | _step_zero_grad: 0.45 | _step_check_overflow: 0.49 samples/sec: 16.527 | iteration 13820/ 143000 | elapsed time per iteration (ms): 61957.6 | learning rate: 5.890E-04 | approx flops per GPU: 71.3TFLOPS | lm_loss: 2.429217E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 00:05:37,977] [INFO] [logging.py:60:log_dist] [Rank 0] step=13830, skipped=11, lr=[0.0005889564490620317, 0.0005889564490620317], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13830 loss: 2.4355 iter time (s): 62.377 samples/sec: 16.416 %comms: 0.0028715367589413556 %optimizer_step 0.06144671280716766 %forward: 23.316107928601834 %backward: 62.54941533479328 [2025-04-04 00:05:37,978] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16812.34 | forward: 145439.91 | backward_microstep: 390177.40 | backward: 390167.24 | backward_inner_microstep: 390151.28 | backward_inner: 390136.38 | backward_allreduce_microstep: 7.64 | backward_allreduce: 2.63 | reduce_tied_grads: 0.27 | comms: 17.91 | reduce_grads: 0.20 | step: 383.29 | _step_clipping: 0.10 | _step_step: 381.67 | _step_zero_grad: 0.47 | _step_check_overflow: 0.50 samples/sec: 16.416 | iteration 13830/ 143000 | elapsed time per iteration (ms): 62378.0 | learning rate: 5.890E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.431114E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 00:16:05,455] [INFO] [logging.py:60:log_dist] [Rank 0] step=13840, skipped=11, lr=[0.0005889387242889919, 0.0005889387242889919], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13840 loss: 2.4447 iter time (s): 62.747 samples/sec: 16.319 %comms: 0.0028595920850918457 %optimizer_step 0.05602306384752812 %forward: 23.184558798528435 %backward: 62.175388845239254 [2025-04-04 00:16:05,456] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20540.52 | forward: 145476.65 | backward_microstep: 390143.06 | backward: 390133.25 | backward_inner_microstep: 390117.23 | backward_inner: 390104.83 | backward_allreduce_microstep: 7.58 | backward_allreduce: 2.61 | reduce_tied_grads: 0.30 | comms: 17.94 | reduce_grads: 0.20 | step: 351.53 | _step_clipping: 0.12 | _step_step: 349.75 | _step_zero_grad: 0.51 | _step_check_overflow: 0.55 samples/sec: 16.319 | iteration 13840/ 143000 | elapsed time per iteration (ms): 62747.8 | learning rate: 5.889E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.433798E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 00:26:31,659] [INFO] [logging.py:60:log_dist] [Rank 0] step=13850, skipped=11, lr=[0.0005889209855704735, 0.0005889209855704735], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13850 loss: 2.4234 iter time (s): 62.620 samples/sec: 16.353 %comms: 0.002872529974645257 %optimizer_step 0.05685424664460478 %forward: 23.218841359898544 %backward: 62.32304955406044 [2025-04-04 00:26:31,659] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19112.92 | forward: 145395.94 | backward_microstep: 390279.60 | backward: 390265.75 | backward_inner_microstep: 390249.25 | backward_inner: 390242.70 | backward_allreduce_microstep: 7.64 | backward_allreduce: 2.63 | reduce_tied_grads: 0.34 | comms: 17.99 | reduce_grads: 0.20 | step: 356.02 | _step_clipping: 0.12 | _step_step: 354.17 | _step_zero_grad: 0.49 | _step_check_overflow: 0.67 samples/sec: 16.353 | iteration 13850/ 143000 | elapsed time per iteration (ms): 62620.4 | learning rate: 5.889E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.435652E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 00:36:53,004] [INFO] [logging.py:60:log_dist] [Rank 0] step=13860, skipped=11, lr=[0.0005889032329073328, 0.0005889032329073328], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13860 loss: 2.4125 iter time (s): 62.134 samples/sec: 16.481 %comms: 0.0029083092296009376 %optimizer_step 0.05814028366805506 %forward: 23.38468619098217 %backward: 62.81159016817135 [2025-04-04 00:36:53,005] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14343.98 | forward: 145298.17 | backward_microstep: 390286.74 | backward: 390272.90 | backward_inner_microstep: 390255.51 | backward_inner: 390249.01 | backward_allreduce_microstep: 7.89 | backward_allreduce: 2.72 | reduce_tied_grads: 0.34 | comms: 18.07 | reduce_grads: 0.23 | step: 361.25 | _step_clipping: 0.12 | _step_step: 359.42 | _step_zero_grad: 0.51 | _step_check_overflow: 0.59 samples/sec: 16.480 | iteration 13860/ 143000 | elapsed time per iteration (ms): 62134.5 | learning rate: 5.889E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.426364E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 00:47:14,587] [INFO] [logging.py:60:log_dist] [Rank 0] step=13870, skipped=11, lr=[0.0005888854663004267, 0.0005888854663004267], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13870 loss: 2.4357 iter time (s): 62.158 samples/sec: 16.474 %comms: 0.0029023968480151275 %optimizer_step 0.05766033955916611 %forward: 23.3780826225435 %backward: 62.80590356785935 [2025-04-04 00:47:14,588] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14453.00 | forward: 145312.99 | backward_microstep: 390401.93 | backward: 390387.61 | backward_inner_microstep: 390370.45 | backward_inner: 390363.66 | backward_allreduce_microstep: 8.09 | backward_allreduce: 2.90 | reduce_tied_grads: 0.33 | comms: 18.04 | reduce_grads: 0.20 | step: 358.40 | _step_clipping: 0.12 | _step_step: 356.60 | _step_zero_grad: 0.51 | _step_check_overflow: 0.57 samples/sec: 16.474 | iteration 13870/ 143000 | elapsed time per iteration (ms): 62158.3 | learning rate: 5.889E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.426887E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 00:57:35,926] [INFO] [logging.py:60:log_dist] [Rank 0] step=13880, skipped=11, lr=[0.0005888676857506125, 0.0005888676857506125], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13880 loss: 2.4236 iter time (s): 62.133 samples/sec: 16.481 %comms: 0.0029011975147355236 %optimizer_step 0.05786854338880654 %forward: 23.386606158119367 %backward: 62.82229902488644 [2025-04-04 00:57:35,927] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14276.06 | forward: 145308.83 | backward_microstep: 390350.14 | backward: 390336.02 | backward_inner_microstep: 390319.18 | backward_inner: 390311.12 | backward_allreduce_microstep: 7.85 | backward_allreduce: 2.69 | reduce_tied_grads: 0.31 | comms: 18.03 | reduce_grads: 0.20 | step: 359.56 | _step_clipping: 0.12 | _step_step: 357.85 | _step_zero_grad: 0.48 | _step_check_overflow: 0.52 samples/sec: 16.481 | iteration 13880/ 143000 | elapsed time per iteration (ms): 62133.9 | learning rate: 5.889E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.424673E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 01:07:57,843] [INFO] [logging.py:60:log_dist] [Rank 0] step=13890, skipped=11, lr=[0.0005888498912587486, 0.0005888498912587486], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13890 loss: 2.4361 iter time (s): 62.191 samples/sec: 16.465 %comms: 0.0029026047580770426 %optimizer_step 0.05780018645423012 %forward: 23.375927609679852 %backward: 62.78450339289229 [2025-04-04 01:07:57,844] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14628.72 | forward: 145377.51 | backward_microstep: 390478.36 | backward: 390463.86 | backward_inner_microstep: 390446.89 | backward_inner: 390440.22 | backward_allreduce_microstep: 7.86 | backward_allreduce: 2.71 | reduce_tied_grads: 0.33 | comms: 18.05 | reduce_grads: 0.21 | step: 359.47 | _step_clipping: 0.12 | _step_step: 357.63 | _step_zero_grad: 0.51 | _step_check_overflow: 0.60 samples/sec: 16.465 | iteration 13890/ 143000 | elapsed time per iteration (ms): 62191.7 | learning rate: 5.888E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.422460E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 01:18:19,789] [INFO] [logging.py:60:log_dist] [Rank 0] step=13900, skipped=11, lr=[0.0005888320828256935, 0.0005888320828256935], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13900 loss: 2.4071 iter time (s): 62.194 samples/sec: 16.465 %comms: 0.002895877957817112 %optimizer_step 0.0571614843772521 %forward: 23.36996620100012 %backward: 62.78357669844402 [2025-04-04 01:18:19,790] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14664.49 | forward: 145347.11 | backward_microstep: 390490.29 | backward: 390476.02 | backward_inner_microstep: 390459.09 | backward_inner: 390452.55 | backward_allreduce_microstep: 7.89 | backward_allreduce: 2.71 | reduce_tied_grads: 0.30 | comms: 18.01 | reduce_grads: 0.19 | step: 355.51 | _step_clipping: 0.10 | _step_step: 353.79 | _step_zero_grad: 0.49 | _step_check_overflow: 0.56 samples/sec: 16.464 | iteration 13900/ 143000 | elapsed time per iteration (ms): 62194.6 | learning rate: 5.888E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.421598E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 01:28:41,806] [INFO] [logging.py:60:log_dist] [Rank 0] step=13910, skipped=11, lr=[0.0005888142604523071, 0.0005888142604523071], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13910 loss: 2.4336 iter time (s): 62.201 samples/sec: 16.463 %comms: 0.0029089611147007136 %optimizer_step 0.05794356777267895 %forward: 23.370074488611834 %backward: 62.77733562252816 [2025-04-04 01:28:41,807] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14748.17 | forward: 145364.47 | backward_microstep: 390495.64 | backward: 390482.03 | backward_inner_microstep: 390464.86 | backward_inner: 390458.29 | backward_allreduce_microstep: 8.07 | backward_allreduce: 2.74 | reduce_tied_grads: 0.32 | comms: 18.09 | reduce_grads: 0.21 | step: 360.42 | _step_clipping: 0.11 | _step_step: 358.51 | _step_zero_grad: 0.59 | _step_check_overflow: 0.52 samples/sec: 16.463 | iteration 13910/ 143000 | elapsed time per iteration (ms): 62201.7 | learning rate: 5.888E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.435292E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 01:39:08,250] [INFO] [logging.py:60:log_dist] [Rank 0] step=13920, skipped=11, lr=[0.0005887964241394493, 0.0005887964241394493], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13920 loss: 2.4500 iter time (s): 62.644 samples/sec: 16.346 %comms: 0.0028732546574546853 %optimizer_step 0.05658656853444849 %forward: 23.20342751520124 %backward: 62.307299433618304 [2025-04-04 01:39:08,251] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19346.68 | forward: 145355.19 | backward_microstep: 390331.84 | backward: 390316.89 | backward_inner_microstep: 390298.87 | backward_inner: 390292.53 | backward_allreduce_microstep: 7.59 | backward_allreduce: 2.60 | reduce_tied_grads: 0.29 | comms: 18.00 | reduce_grads: 0.21 | step: 354.48 | _step_clipping: 0.11 | _step_step: 352.83 | _step_zero_grad: 0.48 | _step_check_overflow: 0.46 samples/sec: 16.346 | iteration 13920/ 143000 | elapsed time per iteration (ms): 62644.4 | learning rate: 5.888E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.434538E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 01:49:32,392] [INFO] [logging.py:60:log_dist] [Rank 0] step=13930, skipped=11, lr=[0.0005887785738879811, 0.0005887785738879811], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13930 loss: 2.4249 iter time (s): 62.414 samples/sec: 16.407 %comms: 0.002880413841651851 %optimizer_step 0.055279161762839195 %forward: 23.30443428254818 %backward: 62.55285049337288 [2025-04-04 01:49:32,395] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16833.97 | forward: 145451.48 | backward_microstep: 390431.10 | backward: 390415.18 | backward_inner_microstep: 390398.79 | backward_inner: 390392.35 | backward_allreduce_microstep: 7.64 | backward_allreduce: 2.66 | reduce_tied_grads: 0.29 | comms: 17.98 | reduce_grads: 0.18 | step: 345.02 | _step_clipping: 0.11 | _step_step: 343.32 | _step_zero_grad: 0.47 | _step_check_overflow: 0.55 samples/sec: 16.406 | iteration 13930/ 143000 | elapsed time per iteration (ms): 62414.4 | learning rate: 5.888E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.428789E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 01:59:55,983] [INFO] [logging.py:60:log_dist] [Rank 0] step=13940, skipped=11, lr=[0.000588760709698764, 0.000588760709698764], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13940 loss: 2.4138 iter time (s): 62.358 samples/sec: 16.421 %comms: 0.002899112500363388 %optimizer_step 0.05745718588721012 %forward: 23.32401370965243 %backward: 62.62140252005581 [2025-04-04 01:59:55,984] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16186.89 | forward: 145444.22 | backward_microstep: 390510.48 | backward: 390495.44 | backward_inner_microstep: 390478.31 | backward_inner: 390471.49 | backward_allreduce_microstep: 7.98 | backward_allreduce: 2.76 | reduce_tied_grads: 0.36 | comms: 18.08 | reduce_grads: 0.21 | step: 358.29 | _step_clipping: 0.13 | _step_step: 356.51 | _step_zero_grad: 0.52 | _step_check_overflow: 0.51 samples/sec: 16.421 | iteration 13940/ 143000 | elapsed time per iteration (ms): 62358.9 | learning rate: 5.888E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.428872E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 02:10:30,806] [INFO] [logging.py:60:log_dist] [Rank 0] step=13950, skipped=11, lr=[0.0005887428315726603, 0.0005887428315726603], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13950 loss: 2.4125 iter time (s): 63.482 samples/sec: 16.131 %comms: 0.002836568295436886 %optimizer_step 0.05508020952041848 %forward: 22.9209240064793 %backward: 61.500594691780464 [2025-04-04 02:10:30,807] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27472.53 | forward: 145506.10 | backward_microstep: 390432.44 | backward: 390416.71 | backward_inner_microstep: 390400.25 | backward_inner: 390393.88 | backward_allreduce_microstep: 7.77 | backward_allreduce: 2.69 | reduce_tied_grads: 0.28 | comms: 18.01 | reduce_grads: 0.19 | step: 349.66 | _step_clipping: 0.11 | _step_step: 348.01 | _step_zero_grad: 0.50 | _step_check_overflow: 0.46 samples/sec: 16.130 | iteration 13950/ 143000 | elapsed time per iteration (ms): 63482.3 | learning rate: 5.887E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.431607E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 02:20:56,911] [INFO] [logging.py:60:log_dist] [Rank 0] step=13960, skipped=11, lr=[0.0005887249395105327, 0.0005887249395105327], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13960 loss: 2.4401 iter time (s): 62.610 samples/sec: 16.355 %comms: 0.002856949521886556 %optimizer_step 0.055322537933319114 %forward: 23.21208917339099 %backward: 62.31403075978942 [2025-04-04 02:20:56,912] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19252.39 | forward: 145330.83 | backward_microstep: 390156.77 | backward: 390147.99 | backward_inner_microstep: 390132.13 | backward_inner: 390126.01 | backward_allreduce_microstep: 7.56 | backward_allreduce: 2.61 | reduce_tied_grads: 0.26 | comms: 17.89 | reduce_grads: 0.18 | step: 346.37 | _step_clipping: 0.12 | _step_step: 344.70 | _step_zero_grad: 0.49 | _step_check_overflow: 0.50 samples/sec: 16.355 | iteration 13960/ 143000 | elapsed time per iteration (ms): 62610.5 | learning rate: 5.887E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.433175E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 02:31:24,013] [INFO] [logging.py:60:log_dist] [Rank 0] step=13970, skipped=11, lr=[0.0005887070335132449, 0.0005887070335132449], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13970 loss: 2.4345 iter time (s): 62.710 samples/sec: 16.329 %comms: 0.0028450354364882766 %optimizer_step 0.05608758618857038 %forward: 23.172677216315787 %backward: 62.21346911054548 [2025-04-04 02:31:24,013] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20253.31 | forward: 145314.91 | backward_microstep: 390145.98 | backward: 390138.12 | backward_inner_microstep: 390122.53 | backward_inner: 390116.62 | backward_allreduce_microstep: 7.47 | backward_allreduce: 2.58 | reduce_tied_grads: 0.26 | comms: 17.84 | reduce_grads: 0.18 | step: 351.72 | _step_clipping: 0.11 | _step_step: 350.09 | _step_zero_grad: 0.48 | _step_check_overflow: 0.50 samples/sec: 16.329 | iteration 13970/ 143000 | elapsed time per iteration (ms): 62710.1 | learning rate: 5.887E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.429524E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 02:41:51,172] [INFO] [logging.py:60:log_dist] [Rank 0] step=13980, skipped=11, lr=[0.0005886891135816609, 0.0005886891135816609], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13980 loss: 2.4340 iter time (s): 62.715 samples/sec: 16.328 %comms: 0.002870089426659741 %optimizer_step 0.05868022658145026 %forward: 23.190735344487873 %backward: 62.20510438445713 [2025-04-04 02:41:51,173] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20189.85 | forward: 145441.68 | backward_microstep: 390129.37 | backward: 390121.95 | backward_inner_microstep: 390105.81 | backward_inner: 390099.81 | backward_allreduce_microstep: 7.91 | backward_allreduce: 2.67 | reduce_tied_grads: 0.33 | comms: 18.00 | reduce_grads: 0.21 | step: 368.02 | _step_clipping: 0.12 | _step_step: 366.25 | _step_zero_grad: 0.48 | _step_check_overflow: 0.59 samples/sec: 16.328 | iteration 13980/ 143000 | elapsed time per iteration (ms): 62716.0 | learning rate: 5.887E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.434095E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 02:52:17,305] [INFO] [logging.py:60:log_dist] [Rank 0] step=13990, skipped=11, lr=[0.0005886711797166459, 0.0005886711797166459], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 13990 loss: 2.4450 iter time (s): 62.613 samples/sec: 16.355 %comms: 0.0028958953789243844 %optimizer_step 0.057028283461279354 %forward: 23.221379648752233 %backward: 62.33219765907997 [2025-04-04 02:52:17,306] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19013.67 | forward: 145395.24 | backward_microstep: 390288.15 | backward: 390278.47 | backward_inner_microstep: 390260.65 | backward_inner: 390254.64 | backward_allreduce_microstep: 9.36 | backward_allreduce: 4.47 | reduce_tied_grads: 0.31 | comms: 18.13 | reduce_grads: 0.20 | step: 357.07 | _step_clipping: 0.13 | _step_step: 355.23 | _step_zero_grad: 0.50 | _step_check_overflow: 0.61 samples/sec: 16.354 | iteration 13990/ 143000 | elapsed time per iteration (ms): 62613.3 | learning rate: 5.887E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.432001E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 03:02:39,122] [INFO] [logging.py:60:log_dist] [Rank 0] step=14000, skipped=11, lr=[0.0005886532319190652, 0.0005886532319190652], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14000 loss: 2.4388 iter time (s): 62.181 samples/sec: 16.468 %comms: 0.003257663734839089 %optimizer_step 0.05839447173551644 %forward: 23.3803007448995 %backward: 62.79946384074119 [2025-04-04 03:02:39,123] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14428.39 | forward: 145381.34 | backward_microstep: 390508.40 | backward: 390494.13 | backward_inner_microstep: 390476.82 | backward_inner: 390470.23 | backward_allreduce_microstep: 8.09 | backward_allreduce: 2.79 | reduce_tied_grads: 0.32 | comms: 20.26 | reduce_grads: 0.21 | step: 363.10 | _step_clipping: 0.11 | _step_step: 361.25 | _step_zero_grad: 0.56 | _step_check_overflow: 0.56 samples/sec: 16.468 | iteration 14000/ 143000 | elapsed time per iteration (ms): 62181.8 | learning rate: 5.887E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.433223E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 03:02:42,001] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: ../checkpoints/mydata_left-pythia160m/global_step14000/mp_rank_00_model_states.pt [2025-04-04 03:02:56,124] [INFO] [engine.py:1805:_copy_recovery_script] creating recovery script ../checkpoints/mydata_left-pythia160m/zero_to_fp32.py [2025-04-04 03:02:56,129] [INFO] [engine.py:1818:_save_zero_checkpoint] zero checkpoint saved ../checkpoints/mydata_left-pythia160m/global_step14000/zero_pp_rank_0_mp_rank_00_optim_states.pt [2025-04-04 03:13:24,965] [INFO] [logging.py:60:log_dist] [Rank 0] step=14010, skipped=11, lr=[0.0005886352701897853, 0.0005886352701897853], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14010 loss: 2.4210 iter time (s): 62.882 samples/sec: 16.284 %comms: 0.0028479979112321133 %optimizer_step 0.055541324265424255 %forward: 23.1741492594973 %backward: 62.11572723245426 [2025-04-04 03:13:24,965] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21006.34 | forward: 145723.94 | backward_microstep: 390610.30 | backward: 390596.79 | backward_inner_microstep: 390580.12 | backward_inner: 390573.73 | backward_allreduce_microstep: 7.77 | backward_allreduce: 2.69 | reduce_tied_grads: 0.28 | comms: 17.91 | reduce_grads: 0.19 | step: 349.26 | _step_clipping: 0.12 | _step_step: 347.63 | _step_zero_grad: 0.47 | _step_check_overflow: 0.48 samples/sec: 15.855 | iteration 14010/ 143000 | elapsed time per iteration (ms): 64584.2 | learning rate: 5.886E-04 | approx flops per GPU: 68.4TFLOPS | lm_loss: 2.425325E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 03:14:27,132] [INFO] [stage1.py:697:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 524288.0, reducing to 262144.0 [2025-04-04 03:23:50,582] [INFO] [logging.py:60:log_dist] [Rank 0] step=14020, skipped=12, lr=[0.000588619092722547, 0.000588619092722547], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14020 loss: 2.4773 iter time (s): 62.561 samples/sec: 16.368 %comms: 0.002583758192723926 %optimizer_step 0.05148118572531594 %forward: 23.232409959112914 %backward: 62.37929977204052 [2025-04-04 03:23:50,583] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18669.06 | forward: 145344.76 | backward_microstep: 390261.87 | backward: 390252.42 | backward_inner_microstep: 390236.41 | backward_inner: 390229.99 | backward_allreduce_microstep: 7.62 | backward_allreduce: 2.63 | reduce_tied_grads: 0.28 | comms: 16.16 | reduce_grads: 0.19 | step: 322.07 | _step_clipping: 0.12 | _step_step: 320.38 | _step_zero_grad: 0.48 | _step_check_overflow: 0.54 samples/sec: 16.368 | iteration 14020/ 143000 | elapsed time per iteration (ms): 62561.8 | learning rate: 5.886E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.459534E+00 | loss scale: 262144.0 | number of skipped iterations: 1 | number of nan iterations: 0 | time (ms) [2025-04-04 03:34:16,033] [INFO] [logging.py:60:log_dist] [Rank 0] step=14030, skipped=12, lr=[0.0005886011045254272, 0.0005886011045254272], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14030 loss: 2.4295 iter time (s): 62.545 samples/sec: 16.372 %comms: 0.0029050337380740737 %optimizer_step 0.05597698256929897 %forward: 23.239995175192128 %backward: 62.41274425370656 [2025-04-04 03:34:16,034] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18329.64 | forward: 145353.51 | backward_microstep: 390368.35 | backward: 390357.71 | backward_inner_microstep: 390340.47 | backward_inner: 390332.27 | backward_allreduce_microstep: 8.29 | backward_allreduce: 2.85 | reduce_tied_grads: 0.32 | comms: 18.17 | reduce_grads: 0.23 | step: 350.11 | _step_clipping: 0.12 | _step_step: 348.36 | _step_zero_grad: 0.49 | _step_check_overflow: 0.57 samples/sec: 16.372 | iteration 14030/ 143000 | elapsed time per iteration (ms): 62545.1 | learning rate: 5.886E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.437547E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 03:44:38,271] [INFO] [logging.py:60:log_dist] [Rank 0] step=14040, skipped=12, lr=[0.000588583102399124, 0.000588583102399124], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14040 loss: 2.4427 iter time (s): 62.223 samples/sec: 16.457 %comms: 0.002874745781744789 %optimizer_step 0.05808476230890451 %forward: 23.356190745135482 %backward: 62.7047759258717 [2025-04-04 03:44:38,271] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15354.28 | forward: 145329.73 | backward_microstep: 390176.73 | backward: 390169.27 | backward_inner_microstep: 390150.87 | backward_inner: 390144.82 | backward_allreduce_microstep: 8.06 | backward_allreduce: 2.79 | reduce_tied_grads: 0.29 | comms: 17.89 | reduce_grads: 0.19 | step: 361.42 | _step_clipping: 0.12 | _step_step: 359.69 | _step_zero_grad: 0.47 | _step_check_overflow: 0.60 samples/sec: 16.457 | iteration 14040/ 143000 | elapsed time per iteration (ms): 62223.8 | learning rate: 5.886E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.436886E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 03:55:05,135] [INFO] [logging.py:60:log_dist] [Rank 0] step=14050, skipped=12, lr=[0.0005885650863445062, 0.0005885650863445062], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14050 loss: 2.4218 iter time (s): 62.686 samples/sec: 16.335 %comms: 0.0028604877200518056 %optimizer_step 0.056311927370766554 %forward: 23.19088542764191 %backward: 62.252933493407205 [2025-04-04 03:55:05,136] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19882.02 | forward: 145374.14 | backward_microstep: 390245.78 | backward: 390238.08 | backward_inner_microstep: 390222.01 | backward_inner: 390214.30 | backward_allreduce_microstep: 7.69 | backward_allreduce: 2.65 | reduce_tied_grads: 0.27 | comms: 17.93 | reduce_grads: 0.18 | step: 353.00 | _step_clipping: 0.13 | _step_step: 351.28 | _step_zero_grad: 0.47 | _step_check_overflow: 0.55 samples/sec: 16.335 | iteration 14050/ 143000 | elapsed time per iteration (ms): 62686.4 | learning rate: 5.886E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.435931E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 04:05:24,327] [INFO] [logging.py:60:log_dist] [Rank 0] step=14060, skipped=12, lr=[0.0005885470563624434, 0.0005885470563624434], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14060 loss: 2.4382 iter time (s): 61.919 samples/sec: 16.538 %comms: 0.0028993614691778372 %optimizer_step 0.057256208938228303 %forward: 23.446976481841265 %backward: 63.00011970560868 [2025-04-04 04:05:24,328] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 12623.58 | forward: 145180.41 | backward_microstep: 390094.40 | backward: 390087.96 | backward_inner_microstep: 390072.89 | backward_inner: 390067.02 | backward_allreduce_microstep: 7.29 | backward_allreduce: 2.51 | reduce_tied_grads: 0.29 | comms: 17.95 | reduce_grads: 0.20 | step: 354.52 | _step_clipping: 0.13 | _step_step: 352.81 | _step_zero_grad: 0.47 | _step_check_overflow: 0.54 samples/sec: 16.538 | iteration 14060/ 143000 | elapsed time per iteration (ms): 61919.2 | learning rate: 5.885E-04 | approx flops per GPU: 71.3TFLOPS | lm_loss: 2.438167E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 04:15:43,443] [INFO] [logging.py:60:log_dist] [Rank 0] step=14070, skipped=12, lr=[0.0005885290124538057, 0.0005885290124538057], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14070 loss: 2.4394 iter time (s): 61.911 samples/sec: 16.540 %comms: 0.0028969461463607257 %optimizer_step 0.05679567936452329 %forward: 23.44219658322489 %backward: 63.00657867140694 [2025-04-04 04:15:43,443] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 12595.02 | forward: 145132.92 | backward_microstep: 390086.07 | backward: 390079.87 | backward_inner_microstep: 390064.94 | backward_inner: 390059.18 | backward_allreduce_microstep: 7.18 | backward_allreduce: 2.47 | reduce_tied_grads: 0.27 | comms: 17.94 | reduce_grads: 0.18 | step: 351.63 | _step_clipping: 0.11 | _step_step: 350.05 | _step_zero_grad: 0.45 | _step_check_overflow: 0.48 samples/sec: 16.540 | iteration 14070/ 143000 | elapsed time per iteration (ms): 61911.5 | learning rate: 5.885E-04 | approx flops per GPU: 71.4TFLOPS | lm_loss: 2.432085E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 04:26:09,470] [INFO] [logging.py:60:log_dist] [Rank 0] step=14080, skipped=12, lr=[0.0005885109546194642, 0.0005885109546194642], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14080 loss: 2.4141 iter time (s): 62.602 samples/sec: 16.357 %comms: 0.002903544957457986 %optimizer_step 0.05728124947972631 %forward: 23.202436573853436 %backward: 62.32096992637667 [2025-04-04 04:26:09,470] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19288.38 | forward: 145252.07 | backward_microstep: 390149.54 | backward: 390142.21 | backward_inner_microstep: 390126.40 | backward_inner: 390120.40 | backward_allreduce_microstep: 7.62 | backward_allreduce: 2.62 | reduce_tied_grads: 0.32 | comms: 18.18 | reduce_grads: 0.21 | step: 358.59 | _step_clipping: 0.12 | _step_step: 356.65 | _step_zero_grad: 0.54 | _step_check_overflow: 0.69 samples/sec: 16.357 | iteration 14080/ 143000 | elapsed time per iteration (ms): 62602.7 | learning rate: 5.885E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.422484E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 04:36:34,782] [INFO] [logging.py:60:log_dist] [Rank 0] step=14090, skipped=12, lr=[0.0005884928828602902, 0.0005884928828602902], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14090 loss: 2.4329 iter time (s): 62.531 samples/sec: 16.376 %comms: 0.0028559991632946197 %optimizer_step 0.05560385015422766 %forward: 23.21613485847397 %backward: 62.377586844834035 [2025-04-04 04:36:34,782] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18811.22 | forward: 145171.96 | backward_microstep: 390057.91 | backward: 390051.00 | backward_inner_microstep: 390035.67 | backward_inner: 390029.96 | backward_allreduce_microstep: 7.42 | backward_allreduce: 2.57 | reduce_tied_grads: 0.25 | comms: 17.86 | reduce_grads: 0.18 | step: 347.69 | _step_clipping: 0.11 | _step_step: 346.09 | _step_zero_grad: 0.46 | _step_check_overflow: 0.51 samples/sec: 16.376 | iteration 14090/ 143000 | elapsed time per iteration (ms): 62531.2 | learning rate: 5.885E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.432510E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 04:47:00,110] [INFO] [logging.py:60:log_dist] [Rank 0] step=14100, skipped=12, lr=[0.000588474797177156, 0.000588474797177156], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14100 loss: 2.4335 iter time (s): 62.532 samples/sec: 16.376 %comms: 0.00287834310350405 %optimizer_step 0.056663817332072264 %forward: 23.227746916399138 %backward: 62.40887303964777 [2025-04-04 04:47:00,111] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18489.31 | forward: 145248.38 | backward_microstep: 390266.83 | backward: 390256.87 | backward_inner_microstep: 390239.15 | backward_inner: 390232.91 | backward_allreduce_microstep: 7.67 | backward_allreduce: 2.65 | reduce_tied_grads: 0.31 | comms: 18.00 | reduce_grads: 0.20 | step: 354.33 | _step_clipping: 0.11 | _step_step: 352.41 | _step_zero_grad: 0.54 | _step_check_overflow: 0.68 samples/sec: 16.375 | iteration 14100/ 143000 | elapsed time per iteration (ms): 62532.9 | learning rate: 5.885E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.432669E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 04:57:21,170] [INFO] [logging.py:60:log_dist] [Rank 0] step=14110, skipped=12, lr=[0.0005884566975709346, 0.0005884566975709346], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14110 loss: 2.4298 iter time (s): 62.105 samples/sec: 16.488 %comms: 0.002898279250873838 %optimizer_step 0.05752529779111979 %forward: 23.391799809532873 %backward: 62.83150170225775 [2025-04-04 04:57:21,171] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14238.86 | forward: 145275.77 | backward_microstep: 390226.05 | backward: 390217.73 | backward_inner_microstep: 390201.85 | backward_inner: 390195.71 | backward_allreduce_microstep: 7.61 | backward_allreduce: 2.61 | reduce_tied_grads: 0.30 | comms: 18.00 | reduce_grads: 0.19 | step: 357.26 | _step_clipping: 0.13 | _step_step: 355.51 | _step_zero_grad: 0.51 | _step_check_overflow: 0.53 samples/sec: 16.488 | iteration 14110/ 143000 | elapsed time per iteration (ms): 62106.0 | learning rate: 5.885E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.428532E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 05:07:41,551] [INFO] [logging.py:60:log_dist] [Rank 0] step=14120, skipped=12, lr=[0.0005884385840424995, 0.0005884385840424995], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14120 loss: 2.4340 iter time (s): 62.037 samples/sec: 16.506 %comms: 0.002903721510039135 %optimizer_step 0.057069564851598066 %forward: 23.406508514288628 %backward: 62.89162724631054 [2025-04-04 05:07:41,551] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13695.86 | forward: 145208.06 | backward_microstep: 390171.79 | backward: 390163.76 | backward_inner_microstep: 390146.19 | backward_inner: 390140.10 | backward_allreduce_microstep: 7.47 | backward_allreduce: 2.57 | reduce_tied_grads: 0.30 | comms: 18.01 | reduce_grads: 0.19 | step: 354.05 | _step_clipping: 0.10 | _step_step: 352.32 | _step_zero_grad: 0.51 | _step_check_overflow: 0.53 samples/sec: 16.506 | iteration 14120/ 143000 | elapsed time per iteration (ms): 62038.0 | learning rate: 5.884E-04 | approx flops per GPU: 71.2TFLOPS | lm_loss: 2.436035E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 05:18:02,345] [INFO] [logging.py:60:log_dist] [Rank 0] step=14130, skipped=12, lr=[0.0005884204565927249, 0.0005884204565927249], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14130 loss: 2.4161 iter time (s): 62.079 samples/sec: 16.495 %comms: 0.0029010950256106722 %optimizer_step 0.057363740838223 %forward: 23.39160415956115 %backward: 62.84958444696112 [2025-04-04 05:18:02,346] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14123.12 | forward: 145212.38 | backward_microstep: 390170.92 | backward: 390162.96 | backward_inner_microstep: 390145.36 | backward_inner: 390139.15 | backward_allreduce_microstep: 7.64 | backward_allreduce: 2.63 | reduce_tied_grads: 0.30 | comms: 18.01 | reduce_grads: 0.19 | step: 356.11 | _step_clipping: 0.10 | _step_step: 354.39 | _step_zero_grad: 0.49 | _step_check_overflow: 0.56 samples/sec: 16.495 | iteration 14130/ 143000 | elapsed time per iteration (ms): 62079.5 | learning rate: 5.884E-04 | approx flops per GPU: 71.2TFLOPS | lm_loss: 2.422098E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 05:28:23,361] [INFO] [logging.py:60:log_dist] [Rank 0] step=14140, skipped=12, lr=[0.0005884023152224857, 0.0005884023152224857], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14140 loss: 2.4265 iter time (s): 62.101 samples/sec: 16.489 %comms: 0.0029132277244996157 %optimizer_step 0.05741136779343382 %forward: 23.393568225808632 %backward: 62.83234836685878 [2025-04-04 05:28:23,362] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14229.26 | forward: 145276.44 | backward_microstep: 390204.69 | backward: 390195.27 | backward_inner_microstep: 390178.50 | backward_inner: 390171.78 | backward_allreduce_microstep: 7.98 | backward_allreduce: 2.74 | reduce_tied_grads: 0.32 | comms: 18.09 | reduce_grads: 0.20 | step: 356.53 | _step_clipping: 0.12 | _step_step: 354.82 | _step_zero_grad: 0.49 | _step_check_overflow: 0.54 samples/sec: 16.489 | iteration 14140/ 143000 | elapsed time per iteration (ms): 62101.6 | learning rate: 5.884E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.425912E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 05:38:44,202] [INFO] [logging.py:60:log_dist] [Rank 0] step=14150, skipped=12, lr=[0.0005883841599326577, 0.0005883841599326577], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14150 loss: 2.4263 iter time (s): 62.083 samples/sec: 16.494 %comms: 0.0029451176994130103 %optimizer_step 0.05769573515770137 %forward: 23.409623490942035 %backward: 62.83133151601867 [2025-04-04 05:38:44,203] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14104.71 | forward: 145335.13 | backward_microstep: 390087.42 | backward: 390078.88 | backward_inner_microstep: 390062.80 | backward_inner: 390056.52 | backward_allreduce_microstep: 7.70 | backward_allreduce: 2.64 | reduce_tied_grads: 0.31 | comms: 18.28 | reduce_grads: 0.21 | step: 358.20 | _step_clipping: 0.12 | _step_step: 356.52 | _step_zero_grad: 0.49 | _step_check_overflow: 0.46 samples/sec: 16.494 | iteration 14150/ 143000 | elapsed time per iteration (ms): 62084.1 | learning rate: 5.884E-04 | approx flops per GPU: 71.2TFLOPS | lm_loss: 2.430405E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 05:49:04,687] [INFO] [logging.py:60:log_dist] [Rank 0] step=14160, skipped=12, lr=[0.0005883659907241169, 0.0005883659907241169], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14160 loss: 2.4109 iter time (s): 62.048 samples/sec: 16.503 %comms: 0.00288205756536132 %optimizer_step 0.05700986907615752 %forward: 23.386299475131494 %backward: 62.86090026953399 [2025-04-04 05:49:04,688] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14054.30 | forward: 145107.27 | backward_microstep: 390045.73 | backward: 390039.20 | backward_inner_microstep: 390023.44 | backward_inner: 390017.54 | backward_allreduce_microstep: 7.62 | backward_allreduce: 2.62 | reduce_tied_grads: 0.28 | comms: 17.88 | reduce_grads: 0.21 | step: 353.73 | _step_clipping: 0.10 | _step_step: 352.07 | _step_zero_grad: 0.48 | _step_check_overflow: 0.54 samples/sec: 16.503 | iteration 14160/ 143000 | elapsed time per iteration (ms): 62048.5 | learning rate: 5.884E-04 | approx flops per GPU: 71.2TFLOPS | lm_loss: 2.422965E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 05:59:25,289] [INFO] [logging.py:60:log_dist] [Rank 0] step=14170, skipped=12, lr=[0.0005883478075977402, 0.0005883478075977402], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14170 loss: 2.4158 iter time (s): 62.060 samples/sec: 16.500 %comms: 0.0029225844569483275 %optimizer_step 0.057620984250139964 %forward: 23.378096332850138 %backward: 62.83973567633526 [2025-04-04 05:59:25,290] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14246.44 | forward: 145083.63 | backward_microstep: 389987.73 | backward: 389981.16 | backward_inner_microstep: 389965.61 | backward_inner: 389959.79 | backward_allreduce_microstep: 7.54 | backward_allreduce: 2.58 | reduce_tied_grads: 0.28 | comms: 18.14 | reduce_grads: 0.19 | step: 357.59 | _step_clipping: 0.11 | _step_step: 355.98 | _step_zero_grad: 0.48 | _step_check_overflow: 0.47 samples/sec: 16.500 | iteration 14170/ 143000 | elapsed time per iteration (ms): 62060.2 | learning rate: 5.883E-04 | approx flops per GPU: 71.2TFLOPS | lm_loss: 2.418448E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 06:09:45,801] [INFO] [logging.py:60:log_dist] [Rank 0] step=14180, skipped=12, lr=[0.0005883296105544055, 0.0005883296105544055], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14180 loss: 2.4206 iter time (s): 62.051 samples/sec: 16.503 %comms: 0.0028895023978668323 %optimizer_step 0.057071034388930464 %forward: 23.38676415900986 %backward: 62.8536153482858 [2025-04-04 06:09:45,802] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14108.01 | forward: 145116.42 | backward_microstep: 390017.46 | backward: 390010.85 | backward_inner_microstep: 389995.00 | backward_inner: 389989.12 | backward_allreduce_microstep: 7.79 | backward_allreduce: 2.81 | reduce_tied_grads: 0.27 | comms: 17.93 | reduce_grads: 0.19 | step: 354.13 | _step_clipping: 0.10 | _step_step: 352.49 | _step_zero_grad: 0.47 | _step_check_overflow: 0.53 samples/sec: 16.502 | iteration 14180/ 143000 | elapsed time per iteration (ms): 62051.2 | learning rate: 5.883E-04 | approx flops per GPU: 71.2TFLOPS | lm_loss: 2.429497E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 06:20:06,287] [INFO] [logging.py:60:log_dist] [Rank 0] step=14190, skipped=12, lr=[0.0005883113995949908, 0.0005883113995949908], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14190 loss: 2.4387 iter time (s): 62.048 samples/sec: 16.503 %comms: 0.0029039206338379196 %optimizer_step 0.057716104638855716 %forward: 23.384108744632297 %backward: 62.853680916369235 [2025-04-04 06:20:06,288] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14137.66 | forward: 145093.71 | backward_microstep: 390001.10 | backward: 389994.50 | backward_inner_microstep: 389978.91 | backward_inner: 389972.89 | backward_allreduce_microstep: 7.56 | backward_allreduce: 2.63 | reduce_tied_grads: 0.26 | comms: 18.02 | reduce_grads: 0.21 | step: 358.12 | _step_clipping: 0.11 | _step_step: 356.52 | _step_zero_grad: 0.51 | _step_check_overflow: 0.43 samples/sec: 16.503 | iteration 14190/ 143000 | elapsed time per iteration (ms): 62048.6 | learning rate: 5.883E-04 | approx flops per GPU: 71.2TFLOPS | lm_loss: 2.435561E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 06:30:26,131] [INFO] [logging.py:60:log_dist] [Rank 0] step=14200, skipped=12, lr=[0.0005882931747203751, 0.0005882931747203751], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14200 loss: 2.4088 iter time (s): 61.984 samples/sec: 16.520 %comms: 0.002887459534875788 %optimizer_step 0.057379238310261066 %forward: 23.412983255320672 %backward: 62.925252782979236 [2025-04-04 06:30:26,132] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13390.45 | forward: 145122.85 | backward_microstep: 390042.41 | backward: 390035.38 | backward_inner_microstep: 390019.40 | backward_inner: 390013.47 | backward_allreduce_microstep: 7.79 | backward_allreduce: 2.68 | reduce_tied_grads: 0.28 | comms: 17.90 | reduce_grads: 0.19 | step: 355.66 | _step_clipping: 0.10 | _step_step: 353.85 | _step_zero_grad: 0.51 | _step_check_overflow: 0.64 samples/sec: 16.520 | iteration 14200/ 143000 | elapsed time per iteration (ms): 61984.5 | learning rate: 5.883E-04 | approx flops per GPU: 71.3TFLOPS | lm_loss: 2.421283E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 06:40:45,789] [INFO] [logging.py:60:log_dist] [Rank 0] step=14210, skipped=12, lr=[0.0005882749359314381, 0.0005882749359314381], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14210 loss: 2.4342 iter time (s): 61.965 samples/sec: 16.525 %comms: 0.002884253658665685 %optimizer_step 0.05919334060774308 %forward: 23.414969044399975 %backward: 62.94129514796853 [2025-04-04 06:40:45,790] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13256.98 | forward: 145091.31 | backward_microstep: 390023.50 | backward: 390016.95 | backward_inner_microstep: 390001.44 | backward_inner: 389995.68 | backward_allreduce_microstep: 7.54 | backward_allreduce: 2.57 | reduce_tied_grads: 0.26 | comms: 17.87 | reduce_grads: 0.18 | step: 366.79 | _step_clipping: 0.10 | _step_step: 365.28 | _step_zero_grad: 0.47 | _step_check_overflow: 0.41 samples/sec: 16.525 | iteration 14210/ 143000 | elapsed time per iteration (ms): 61965.8 | learning rate: 5.883E-04 | approx flops per GPU: 71.3TFLOPS | lm_loss: 2.423199E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 06:51:05,626] [INFO] [logging.py:60:log_dist] [Rank 0] step=14220, skipped=12, lr=[0.0005882566832290599, 0.0005882566832290599], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14220 loss: 2.4597 iter time (s): 61.983 samples/sec: 16.521 %comms: 0.002884998650232362 %optimizer_step 0.057182809599818446 %forward: 23.41620592061575 %backward: 62.92909050583528 [2025-04-04 06:51:05,626] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13368.17 | forward: 145140.84 | backward_microstep: 390060.79 | backward: 390053.85 | backward_inner_microstep: 390035.95 | backward_inner: 390030.06 | backward_allreduce_microstep: 9.75 | backward_allreduce: 2.64 | reduce_tied_grads: 0.26 | comms: 17.88 | reduce_grads: 0.19 | step: 354.44 | _step_clipping: 0.10 | _step_step: 352.71 | _step_zero_grad: 0.51 | _step_check_overflow: 0.57 samples/sec: 16.520 | iteration 14220/ 143000 | elapsed time per iteration (ms): 61983.6 | learning rate: 5.883E-04 | approx flops per GPU: 71.3TFLOPS | lm_loss: 2.436815E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 07:01:25,198] [INFO] [logging.py:60:log_dist] [Rank 0] step=14230, skipped=12, lr=[0.0005882384166141217, 0.0005882384166141217], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14230 loss: 2.4222 iter time (s): 61.957 samples/sec: 16.528 %comms: 0.00289738989747243 %optimizer_step 0.05711725107049376 %forward: 23.422149518347787 %backward: 62.94527021121178 [2025-04-04 07:01:25,198] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13179.28 | forward: 145115.73 | backward_microstep: 389994.39 | backward: 389987.65 | backward_inner_microstep: 389971.84 | backward_inner: 389965.92 | backward_allreduce_microstep: 7.68 | backward_allreduce: 2.63 | reduce_tied_grads: 0.28 | comms: 17.95 | reduce_grads: 0.19 | step: 353.88 | _step_clipping: 0.11 | _step_step: 352.28 | _step_zero_grad: 0.50 | _step_check_overflow: 0.42 samples/sec: 16.528 | iteration 14230/ 143000 | elapsed time per iteration (ms): 61957.2 | learning rate: 5.882E-04 | approx flops per GPU: 71.3TFLOPS | lm_loss: 2.429063E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 07:11:44,918] [INFO] [logging.py:60:log_dist] [Rank 0] step=14240, skipped=12, lr=[0.0005882201360875051, 0.0005882201360875051], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14240 loss: 2.4305 iter time (s): 61.972 samples/sec: 16.524 %comms: 0.0028944231535496393 %optimizer_step 0.05744018412018383 %forward: 23.4198443407282 %backward: 62.93029967357836 [2025-04-04 07:11:44,919] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13330.30 | forward: 145136.36 | backward_microstep: 389995.74 | backward: 389988.70 | backward_inner_microstep: 389972.67 | backward_inner: 389966.69 | backward_allreduce_microstep: 7.75 | backward_allreduce: 2.65 | reduce_tied_grads: 0.27 | comms: 17.94 | reduce_grads: 0.20 | step: 355.97 | _step_clipping: 0.10 | _step_step: 354.21 | _step_zero_grad: 0.58 | _step_check_overflow: 0.50 samples/sec: 16.524 | iteration 14240/ 143000 | elapsed time per iteration (ms): 61972.1 | learning rate: 5.882E-04 | approx flops per GPU: 71.3TFLOPS | lm_loss: 2.423536E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 07:22:04,437] [INFO] [logging.py:60:log_dist] [Rank 0] step=14250, skipped=12, lr=[0.0005882018416500922, 0.0005882018416500922], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14250 loss: 2.4344 iter time (s): 61.951 samples/sec: 16.529 %comms: 0.0028955591882187303 %optimizer_step 0.057480885019262726 %forward: 23.419618032528472 %backward: 62.946325511664256 [2025-04-04 07:22:04,438] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13204.91 | forward: 145087.66 | backward_microstep: 389967.74 | backward: 389960.88 | backward_inner_microstep: 389945.14 | backward_inner: 389939.29 | backward_allreduce_microstep: 7.64 | backward_allreduce: 2.61 | reduce_tied_grads: 0.30 | comms: 17.94 | reduce_grads: 0.19 | step: 356.10 | _step_clipping: 0.10 | _step_step: 354.40 | _step_zero_grad: 0.50 | _step_check_overflow: 0.54 samples/sec: 16.529 | iteration 14250/ 143000 | elapsed time per iteration (ms): 61951.9 | learning rate: 5.882E-04 | approx flops per GPU: 71.3TFLOPS | lm_loss: 2.427607E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 07:32:24,112] [INFO] [logging.py:60:log_dist] [Rank 0] step=14260, skipped=12, lr=[0.000588183533302766, 0.000588183533302766], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14260 loss: 2.4204 iter time (s): 61.967 samples/sec: 16.525 %comms: 0.002893560831144942 %optimizer_step 0.05718085731655049 %forward: 23.41829317106412 %backward: 62.93864407560158 [2025-04-04 07:32:24,113] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13244.67 | forward: 145115.97 | backward_microstep: 390018.37 | backward: 390011.44 | backward_inner_microstep: 389995.40 | backward_inner: 389989.52 | backward_allreduce_microstep: 7.69 | backward_allreduce: 2.63 | reduce_tied_grads: 0.28 | comms: 17.93 | reduce_grads: 0.19 | step: 354.33 | _step_clipping: 0.10 | _step_step: 352.80 | _step_zero_grad: 0.47 | _step_check_overflow: 0.41 samples/sec: 16.525 | iteration 14260/ 143000 | elapsed time per iteration (ms): 61967.5 | learning rate: 5.882E-04 | approx flops per GPU: 71.3TFLOPS | lm_loss: 2.423092E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 07:42:43,891] [INFO] [logging.py:60:log_dist] [Rank 0] step=14270, skipped=12, lr=[0.0005881652110464103, 0.0005881652110464103], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14270 loss: 2.4387 iter time (s): 61.977 samples/sec: 16.522 %comms: 0.0029019602286452606 %optimizer_step 0.05776215329081945 %forward: 23.416041838799732 %backward: 62.92751235833464 [2025-04-04 07:42:43,892] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13355.66 | forward: 145126.44 | backward_microstep: 390015.07 | backward: 390008.09 | backward_inner_microstep: 389992.19 | backward_inner: 389986.24 | backward_allreduce_microstep: 7.70 | backward_allreduce: 2.64 | reduce_tied_grads: 0.31 | comms: 17.99 | reduce_grads: 0.20 | step: 357.99 | _step_clipping: 0.11 | _step_step: 356.36 | _step_zero_grad: 0.53 | _step_check_overflow: 0.42 samples/sec: 16.522 | iteration 14270/ 143000 | elapsed time per iteration (ms): 61977.9 | learning rate: 5.882E-04 | approx flops per GPU: 71.3TFLOPS | lm_loss: 2.427158E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 07:53:03,629] [INFO] [logging.py:60:log_dist] [Rank 0] step=14280, skipped=12, lr=[0.0005881468748819094, 0.0005881468748819094], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14280 loss: 2.4324 iter time (s): 61.973 samples/sec: 16.523 %comms: 0.0029022268057752112 %optimizer_step 0.057088258172912805 %forward: 23.41660865338838 %backward: 62.93609476426416 [2025-04-04 07:53:03,630] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13264.74 | forward: 145120.47 | backward_microstep: 390043.11 | backward: 390035.79 | backward_inner_microstep: 390020.44 | backward_inner: 390014.63 | backward_allreduce_microstep: 7.40 | backward_allreduce: 2.57 | reduce_tied_grads: 0.33 | comms: 17.99 | reduce_grads: 0.20 | step: 353.79 | _step_clipping: 0.11 | _step_step: 352.17 | _step_zero_grad: 0.48 | _step_check_overflow: 0.46 samples/sec: 16.523 | iteration 14280/ 143000 | elapsed time per iteration (ms): 61973.8 | learning rate: 5.881E-04 | approx flops per GPU: 71.3TFLOPS | lm_loss: 2.424800E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 08:03:29,435] [INFO] [logging.py:60:log_dist] [Rank 0] step=14290, skipped=12, lr=[0.0005881285248101482, 0.0005881285248101482], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14290 loss: 2.4370 iter time (s): 62.580 samples/sec: 16.363 %comms: 0.0028672717184513694 %optimizer_step 0.056307448480518114 %forward: 23.20954642414574 %backward: 62.34687033649742 [2025-04-04 08:03:29,436] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19064.12 | forward: 145245.31 | backward_microstep: 390176.84 | backward: 390166.62 | backward_inner_microstep: 390150.17 | backward_inner: 390143.60 | backward_allreduce_microstep: 7.81 | backward_allreduce: 2.69 | reduce_tied_grads: 0.29 | comms: 17.94 | reduce_grads: 0.20 | step: 352.37 | _step_clipping: 0.13 | _step_step: 350.70 | _step_zero_grad: 0.46 | _step_check_overflow: 0.53 samples/sec: 16.363 | iteration 14290/ 143000 | elapsed time per iteration (ms): 62580.6 | learning rate: 5.881E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.429765E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 08:13:51,719] [INFO] [logging.py:60:log_dist] [Rank 0] step=14300, skipped=12, lr=[0.0005881101608320124, 0.0005881101608320124], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14300 loss: 2.4257 iter time (s): 62.228 samples/sec: 16.456 %comms: 0.002940893174728201 %optimizer_step 0.05737917907455511 %forward: 23.366075424214998 %backward: 62.73586151094885 [2025-04-04 08:13:51,720] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15103.39 | forward: 145401.96 | backward_microstep: 390401.02 | backward: 390391.51 | backward_inner_microstep: 390375.19 | backward_inner: 390368.84 | backward_allreduce_microstep: 7.64 | backward_allreduce: 2.62 | reduce_tied_grads: 0.31 | comms: 18.30 | reduce_grads: 0.20 | step: 357.06 | _step_clipping: 0.11 | _step_step: 355.29 | _step_zero_grad: 0.49 | _step_check_overflow: 0.55 samples/sec: 16.456 | iteration 14300/ 143000 | elapsed time per iteration (ms): 62228.4 | learning rate: 5.881E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.424987E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 08:24:21,510] [INFO] [logging.py:60:log_dist] [Rank 0] step=14310, skipped=12, lr=[0.0005880917829483883, 0.0005880917829483883], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14310 loss: 2.4367 iter time (s): 62.979 samples/sec: 16.260 %comms: 0.0028798275351618174 %optimizer_step 0.05727185034739664 %forward: 23.091258801042517 %backward: 61.95093212416744 [2025-04-04 08:24:21,511] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22847.87 | forward: 145425.43 | backward_microstep: 390167.84 | backward: 390158.06 | backward_inner_microstep: 390141.49 | backward_inner: 390135.11 | backward_allreduce_microstep: 7.91 | backward_allreduce: 2.71 | reduce_tied_grads: 0.29 | comms: 18.14 | reduce_grads: 0.20 | step: 360.69 | _step_clipping: 0.11 | _step_step: 358.93 | _step_zero_grad: 0.52 | _step_check_overflow: 0.55 samples/sec: 16.259 | iteration 14310/ 143000 | elapsed time per iteration (ms): 62979.1 | learning rate: 5.881E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.422842E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 08:34:47,268] [INFO] [logging.py:60:log_dist] [Rank 0] step=14320, skipped=12, lr=[0.0005880733911601628, 0.0005880733911601628], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14320 loss: 2.4351 iter time (s): 62.575 samples/sec: 16.364 %comms: 0.0028748448047496225 %optimizer_step 0.05809245490833375 %forward: 23.204967257539703 %backward: 62.39588387178772 [2025-04-04 08:34:47,268] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18770.08 | forward: 145205.53 | backward_microstep: 390450.90 | backward: 390443.45 | backward_inner_microstep: 390427.72 | backward_inner: 390419.73 | backward_allreduce_microstep: 7.62 | backward_allreduce: 2.64 | reduce_tied_grads: 0.31 | comms: 17.99 | reduce_grads: 0.20 | step: 363.51 | _step_clipping: 0.13 | _step_step: 361.82 | _step_zero_grad: 0.47 | _step_check_overflow: 0.51 samples/sec: 16.364 | iteration 14320/ 143000 | elapsed time per iteration (ms): 62575.7 | learning rate: 5.881E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.433127E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 08:45:15,421] [INFO] [logging.py:60:log_dist] [Rank 0] step=14330, skipped=12, lr=[0.0005880549854682238, 0.0005880549854682238], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14330 loss: 2.4269 iter time (s): 62.815 samples/sec: 16.302 %comms: 0.0028665750052448676 %optimizer_step 0.05877226490316134 %forward: 23.12421170183696 %backward: 62.09308060846339 [2025-04-04 08:45:15,422] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21554.36 | forward: 145254.20 | backward_microstep: 390043.88 | backward: 390036.25 | backward_inner_microstep: 390018.62 | backward_inner: 390012.62 | backward_allreduce_microstep: 7.62 | backward_allreduce: 2.62 | reduce_tied_grads: 0.31 | comms: 18.01 | reduce_grads: 0.22 | step: 369.18 | _step_clipping: 0.13 | _step_step: 367.42 | _step_zero_grad: 0.50 | _step_check_overflow: 0.52 samples/sec: 16.302 | iteration 14330/ 143000 | elapsed time per iteration (ms): 62815.3 | learning rate: 5.881E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.424162E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 08:55:39,971] [INFO] [logging.py:60:log_dist] [Rank 0] step=14340, skipped=12, lr=[0.0005880365658734595, 0.0005880365658734595], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14340 loss: 2.4382 iter time (s): 62.454 samples/sec: 16.396 %comms: 0.002850130619623367 %optimizer_step 0.0548320695747531 %forward: 23.24272041499844 %backward: 62.44387119925165 [2025-04-04 08:55:39,971] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18090.23 | forward: 145161.11 | backward_microstep: 389996.54 | backward: 389989.70 | backward_inner_microstep: 389974.21 | backward_inner: 389968.25 | backward_allreduce_microstep: 7.50 | backward_allreduce: 2.57 | reduce_tied_grads: 0.24 | comms: 17.80 | reduce_grads: 0.18 | step: 342.45 | _step_clipping: 0.10 | _step_step: 340.85 | _step_zero_grad: 0.47 | _step_check_overflow: 0.50 samples/sec: 16.396 | iteration 14340/ 143000 | elapsed time per iteration (ms): 62454.9 | learning rate: 5.880E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.431258E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 09:06:09,561] [INFO] [logging.py:60:log_dist] [Rank 0] step=14350, skipped=12, lr=[0.0005880181323767589, 0.0005880181323767589], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14350 loss: 2.4415 iter time (s): 62.959 samples/sec: 16.265 %comms: 0.0028720343536707254 %optimizer_step 0.056345321853020186 %forward: 23.103095438265445 %backward: 61.979665790767825 [2025-04-04 09:06:09,562] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22557.58 | forward: 145453.67 | backward_microstep: 390224.82 | backward: 390214.81 | backward_inner_microstep: 390196.34 | backward_inner: 390189.95 | backward_allreduce_microstep: 8.01 | backward_allreduce: 2.77 | reduce_tied_grads: 0.31 | comms: 18.08 | reduce_grads: 0.20 | step: 354.74 | _step_clipping: 0.13 | _step_step: 352.96 | _step_zero_grad: 0.50 | _step_check_overflow: 0.57 samples/sec: 16.265 | iteration 14350/ 143000 | elapsed time per iteration (ms): 62959.1 | learning rate: 5.880E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.435189E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 09:16:32,376] [INFO] [logging.py:60:log_dist] [Rank 0] step=14360, skipped=12, lr=[0.0005879996849790118, 0.0005879996849790118], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14360 loss: 2.4216 iter time (s): 62.281 samples/sec: 16.442 %comms: 0.0028813837412515 %optimizer_step 0.056615100174910504 %forward: 23.34158384078654 %backward: 62.651074929070575 [2025-04-04 09:16:32,376] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15873.66 | forward: 145373.57 | backward_microstep: 390207.37 | backward: 390196.77 | backward_inner_microstep: 390180.73 | backward_inner: 390174.35 | backward_allreduce_microstep: 7.72 | backward_allreduce: 2.66 | reduce_tied_grads: 0.29 | comms: 17.95 | reduce_grads: 0.19 | step: 352.60 | _step_clipping: 0.12 | _step_step: 350.92 | _step_zero_grad: 0.49 | _step_check_overflow: 0.51 samples/sec: 16.441 | iteration 14360/ 143000 | elapsed time per iteration (ms): 62281.5 | learning rate: 5.880E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.428157E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 09:27:00,464] [INFO] [logging.py:60:log_dist] [Rank 0] step=14370, skipped=12, lr=[0.0005879812236811085, 0.0005879812236811085], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14370 loss: 2.4286 iter time (s): 62.808 samples/sec: 16.304 %comms: 0.002880271541905197 %optimizer_step 0.05636832276120725 %forward: 23.17008877895627 %backward: 62.151490422202414 [2025-04-04 09:27:00,464] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20796.52 | forward: 145527.31 | backward_microstep: 390375.95 | backward: 390362.74 | backward_inner_microstep: 390346.20 | backward_inner: 390339.88 | backward_allreduce_microstep: 7.88 | backward_allreduce: 2.68 | reduce_tied_grads: 0.31 | comms: 18.09 | reduce_grads: 0.20 | step: 354.04 | _step_clipping: 0.12 | _step_step: 352.15 | _step_zero_grad: 0.49 | _step_check_overflow: 0.69 samples/sec: 16.303 | iteration 14370/ 143000 | elapsed time per iteration (ms): 62808.8 | learning rate: 5.880E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.430133E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 09:37:31,613] [INFO] [logging.py:60:log_dist] [Rank 0] step=14380, skipped=12, lr=[0.0005879627484839399, 0.0005879627484839399], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14380 loss: 2.4191 iter time (s): 63.114 samples/sec: 16.225 %comms: 0.002837480000051647 %optimizer_step 0.0548139758137623 %forward: 23.052666363002338 %backward: 61.84064232266779 [2025-04-04 09:37:31,613] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23958.77 | forward: 145495.43 | backward_microstep: 390314.47 | backward: 390303.25 | backward_inner_microstep: 390285.97 | backward_inner: 390279.47 | backward_allreduce_microstep: 8.18 | backward_allreduce: 2.79 | reduce_tied_grads: 0.28 | comms: 17.91 | reduce_grads: 0.18 | step: 345.95 | _step_clipping: 0.13 | _step_step: 344.28 | _step_zero_grad: 0.46 | _step_check_overflow: 0.53 samples/sec: 16.224 | iteration 14380/ 143000 | elapsed time per iteration (ms): 63114.9 | learning rate: 5.880E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.426662E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 09:48:01,317] [INFO] [logging.py:60:log_dist] [Rank 0] step=14390, skipped=12, lr=[0.0005879442593883978, 0.0005879442593883978], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14390 loss: 2.4156 iter time (s): 62.970 samples/sec: 16.262 %comms: 0.002849669094103076 %optimizer_step 0.05564949695930835 %forward: 23.10690725871351 %backward: 61.97585268769499 [2025-04-04 09:48:01,318] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22537.96 | forward: 145503.95 | backward_microstep: 390272.48 | backward: 390261.28 | backward_inner_microstep: 390244.61 | backward_inner: 390238.12 | backward_allreduce_microstep: 7.93 | backward_allreduce: 2.73 | reduce_tied_grads: 0.30 | comms: 17.94 | reduce_grads: 0.20 | step: 350.42 | _step_clipping: 0.14 | _step_step: 348.74 | _step_zero_grad: 0.50 | _step_check_overflow: 0.46 samples/sec: 16.262 | iteration 14390/ 143000 | elapsed time per iteration (ms): 62970.4 | learning rate: 5.879E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.424155E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 09:58:22,199] [INFO] [logging.py:60:log_dist] [Rank 0] step=14400, skipped=12, lr=[0.0005879257563953746, 0.0005879257563953746], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14400 loss: 2.4168 iter time (s): 62.088 samples/sec: 16.493 %comms: 0.002901334827795268 %optimizer_step 0.05565601550331097 %forward: 23.413425881584065 %backward: 62.86282104508759 [2025-04-04 09:58:22,200] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13798.09 | forward: 145368.54 | backward_microstep: 390315.22 | backward: 390300.69 | backward_inner_microstep: 390283.95 | backward_inner: 390277.51 | backward_allreduce_microstep: 7.85 | backward_allreduce: 2.71 | reduce_tied_grads: 0.31 | comms: 18.01 | reduce_grads: 0.19 | step: 345.56 | _step_clipping: 0.11 | _step_step: 343.86 | _step_zero_grad: 0.48 | _step_check_overflow: 0.55 samples/sec: 16.493 | iteration 14400/ 143000 | elapsed time per iteration (ms): 62088.2 | learning rate: 5.879E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.417953E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 10:08:48,271] [INFO] [logging.py:60:log_dist] [Rank 0] step=14410, skipped=12, lr=[0.0005879072395057632, 0.0005879072395057632], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14410 loss: 2.4124 iter time (s): 62.607 samples/sec: 16.356 %comms: 0.002897887579106445 %optimizer_step 0.05714877573789323 %forward: 23.215572960559538 %backward: 62.34035390292115 [2025-04-04 10:08:48,272] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19001.69 | forward: 145344.90 | backward_microstep: 390304.96 | backward: 390292.00 | backward_inner_microstep: 390275.42 | backward_inner: 390269.06 | backward_allreduce_microstep: 7.74 | backward_allreduce: 2.67 | reduce_tied_grads: 0.37 | comms: 18.14 | reduce_grads: 0.21 | step: 357.79 | _step_clipping: 0.12 | _step_step: 355.91 | _step_zero_grad: 0.50 | _step_check_overflow: 0.68 samples/sec: 16.356 | iteration 14410/ 143000 | elapsed time per iteration (ms): 62607.2 | learning rate: 5.879E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.422566E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 10:19:18,657] [INFO] [logging.py:60:log_dist] [Rank 0] step=14420, skipped=12, lr=[0.0005878887087204575, 0.0005878887087204575], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14420 loss: 2.4249 iter time (s): 63.038 samples/sec: 16.244 %comms: 0.0028698512311617907 %optimizer_step 0.05694850044987629 %forward: 23.064764920137335 %backward: 61.91225938275109 [2025-04-04 10:19:18,658] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23326.42 | forward: 145395.63 | backward_microstep: 390294.31 | backward: 390282.40 | backward_inner_microstep: 390265.28 | backward_inner: 390258.61 | backward_allreduce_microstep: 8.14 | backward_allreduce: 2.81 | reduce_tied_grads: 0.31 | comms: 18.09 | reduce_grads: 0.22 | step: 358.99 | _step_clipping: 0.13 | _step_step: 357.22 | _step_zero_grad: 0.50 | _step_check_overflow: 0.53 samples/sec: 16.244 | iteration 14420/ 143000 | elapsed time per iteration (ms): 63038.6 | learning rate: 5.879E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.418589E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 10:29:44,416] [INFO] [logging.py:60:log_dist] [Rank 0] step=14430, skipped=12, lr=[0.0005878701640403518, 0.0005878701640403518], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14430 loss: 2.4259 iter time (s): 62.575 samples/sec: 16.364 %comms: 0.002910200182600821 %optimizer_step 0.057369437139406114 %forward: 23.221982877583986 %backward: 62.3526202603115 [2025-04-04 10:29:44,416] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18876.63 | forward: 145312.13 | backward_microstep: 390182.32 | backward: 390173.06 | backward_inner_microstep: 390156.76 | backward_inner: 390150.49 | backward_allreduce_microstep: 7.77 | backward_allreduce: 2.67 | reduce_tied_grads: 0.33 | comms: 18.21 | reduce_grads: 0.21 | step: 358.99 | _step_clipping: 0.15 | _step_step: 355.28 | _step_zero_grad: 0.58 | _step_check_overflow: 0.56 samples/sec: 16.364 | iteration 14430/ 143000 | elapsed time per iteration (ms): 62575.9 | learning rate: 5.879E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.418310E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 10:40:07,482] [INFO] [logging.py:60:log_dist] [Rank 0] step=14440, skipped=12, lr=[0.000587851605466341, 0.000587851605466341], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14440 loss: 2.4276 iter time (s): 62.306 samples/sec: 16.435 %comms: 0.002866024810630887 %optimizer_step 0.05549617870284675 %forward: 23.29884611624774 %backward: 62.58317264422 [2025-04-04 10:40:07,483] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16647.35 | forward: 145165.96 | backward_microstep: 389937.77 | backward: 389931.16 | backward_inner_microstep: 389915.81 | backward_inner: 389910.03 | backward_allreduce_microstep: 7.29 | backward_allreduce: 2.50 | reduce_tied_grads: 0.25 | comms: 17.86 | reduce_grads: 0.18 | step: 345.77 | _step_clipping: 0.11 | _step_step: 344.12 | _step_zero_grad: 0.47 | _step_check_overflow: 0.52 samples/sec: 16.435 | iteration 14440/ 143000 | elapsed time per iteration (ms): 62306.6 | learning rate: 5.879E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.420651E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 10:50:26,582] [INFO] [logging.py:60:log_dist] [Rank 0] step=14450, skipped=12, lr=[0.0005878330329993212, 0.0005878330329993212], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14450 loss: 2.4337 iter time (s): 61.909 samples/sec: 16.540 %comms: 0.0029402275085103928 %optimizer_step 0.056088455895043854 %forward: 23.4422472631915 %backward: 62.9755234940522 [2025-04-04 10:50:26,582] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 12753.91 | forward: 145129.62 | backward_microstep: 389884.95 | backward: 389877.89 | backward_inner_microstep: 389862.77 | backward_inner: 389856.82 | backward_allreduce_microstep: 7.16 | backward_allreduce: 2.45 | reduce_tied_grads: 0.28 | comms: 18.20 | reduce_grads: 0.18 | step: 347.24 | _step_clipping: 0.10 | _step_step: 345.58 | _step_zero_grad: 0.47 | _step_check_overflow: 0.53 samples/sec: 16.540 | iteration 14450/ 143000 | elapsed time per iteration (ms): 61910.0 | learning rate: 5.878E-04 | approx flops per GPU: 71.4TFLOPS | lm_loss: 2.422111E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 11:00:45,574] [INFO] [logging.py:60:log_dist] [Rank 0] step=14460, skipped=12, lr=[0.0005878144466401884, 0.0005878144466401884], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14460 loss: 2.4277 iter time (s): 61.899 samples/sec: 16.543 %comms: 0.0028845392576814536 %optimizer_step 0.058457553610981476 %forward: 23.438983909485444 %backward: 62.990853855497384 [2025-04-04 11:00:45,575] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 12692.05 | forward: 145084.31 | backward_microstep: 389911.88 | backward: 389905.33 | backward_inner_microstep: 389890.19 | backward_inner: 389884.43 | backward_allreduce_microstep: 7.32 | backward_allreduce: 2.52 | reduce_tied_grads: 0.26 | comms: 17.85 | reduce_grads: 0.18 | step: 361.84 | _step_clipping: 0.10 | _step_step: 360.35 | _step_zero_grad: 0.45 | _step_check_overflow: 0.40 samples/sec: 16.543 | iteration 14460/ 143000 | elapsed time per iteration (ms): 61899.2 | learning rate: 5.878E-04 | approx flops per GPU: 71.4TFLOPS | lm_loss: 2.416052E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 11:11:04,914] [INFO] [logging.py:60:log_dist] [Rank 0] step=14470, skipped=12, lr=[0.0005877958463898397, 0.0005877958463898397], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14470 loss: 2.4276 iter time (s): 61.933 samples/sec: 16.534 %comms: 0.0028955493086974863 %optimizer_step 0.055899534681218666 %forward: 23.43094768983316 %backward: 62.96179445854941 [2025-04-04 11:11:04,914] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 12970.65 | forward: 145115.90 | backward_microstep: 389950.62 | backward: 389943.99 | backward_inner_microstep: 389928.83 | backward_inner: 389923.08 | backward_allreduce_microstep: 7.29 | backward_allreduce: 2.50 | reduce_tied_grads: 0.26 | comms: 17.93 | reduce_grads: 0.20 | step: 346.20 | _step_clipping: 0.10 | _step_step: 344.62 | _step_zero_grad: 0.46 | _step_check_overflow: 0.48 samples/sec: 16.534 | iteration 14470/ 143000 | elapsed time per iteration (ms): 61933.9 | learning rate: 5.878E-04 | approx flops per GPU: 71.3TFLOPS | lm_loss: 2.431117E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 11:21:24,209] [INFO] [logging.py:60:log_dist] [Rank 0] step=14480, skipped=12, lr=[0.0005877772322491731, 0.0005877772322491731], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14480 loss: 2.4291 iter time (s): 61.929 samples/sec: 16.535 %comms: 0.002914888634743996 %optimizer_step 0.056296111115961306 %forward: 23.42983557742434 %backward: 62.969039076139524 [2025-04-04 11:21:24,210] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 12925.98 | forward: 145098.71 | backward_microstep: 389967.62 | backward: 389961.19 | backward_inner_microstep: 389945.99 | backward_inner: 389940.33 | backward_allreduce_microstep: 7.25 | backward_allreduce: 2.48 | reduce_tied_grads: 0.27 | comms: 18.05 | reduce_grads: 0.18 | step: 348.64 | _step_clipping: 0.10 | _step_step: 347.06 | _step_zero_grad: 0.46 | _step_check_overflow: 0.46 samples/sec: 16.535 | iteration 14480/ 143000 | elapsed time per iteration (ms): 61929.6 | learning rate: 5.878E-04 | approx flops per GPU: 71.3TFLOPS | lm_loss: 2.424879E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 11:31:43,480] [INFO] [logging.py:60:log_dist] [Rank 0] step=14490, skipped=12, lr=[0.0005877586042190867, 0.0005877586042190867], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14490 loss: 2.4292 iter time (s): 61.926 samples/sec: 16.536 %comms: 0.0028881011176057347 %optimizer_step 0.05577906363636074 %forward: 23.43659645355651 %backward: 62.965582993518034 [2025-04-04 11:31:43,481] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 12877.53 | forward: 145134.40 | backward_microstep: 389929.70 | backward: 389923.17 | backward_inner_microstep: 389908.16 | backward_inner: 389902.45 | backward_allreduce_microstep: 7.23 | backward_allreduce: 2.47 | reduce_tied_grads: 0.27 | comms: 17.88 | reduce_grads: 0.18 | step: 345.42 | _step_clipping: 0.09 | _step_step: 343.81 | _step_zero_grad: 0.45 | _step_check_overflow: 0.52 samples/sec: 16.536 | iteration 14490/ 143000 | elapsed time per iteration (ms): 61927.1 | learning rate: 5.878E-04 | approx flops per GPU: 71.3TFLOPS | lm_loss: 2.422402E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 11:42:02,758] [INFO] [logging.py:60:log_dist] [Rank 0] step=14500, skipped=12, lr=[0.0005877399623004799, 0.0005877399623004799], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14500 loss: 2.4055 iter time (s): 61.927 samples/sec: 16.536 %comms: 0.002892219481572551 %optimizer_step 0.05595328010527603 %forward: 23.436917098189312 %backward: 62.977393761038634 [2025-04-04 11:42:02,759] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 12814.76 | forward: 145138.37 | backward_microstep: 390008.26 | backward: 390001.66 | backward_inner_microstep: 389986.83 | backward_inner: 389981.14 | backward_allreduce_microstep: 7.15 | backward_allreduce: 2.45 | reduce_tied_grads: 0.25 | comms: 17.91 | reduce_grads: 0.18 | step: 346.50 | _step_clipping: 0.10 | _step_step: 344.97 | _step_zero_grad: 0.47 | _step_check_overflow: 0.41 samples/sec: 16.535 | iteration 14500/ 143000 | elapsed time per iteration (ms): 61927.8 | learning rate: 5.877E-04 | approx flops per GPU: 71.3TFLOPS | lm_loss: 2.418688E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 11:52:22,137] [INFO] [logging.py:60:log_dist] [Rank 0] step=14510, skipped=12, lr=[0.0005877213064942521, 0.0005877213064942521], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14510 loss: 2.4084 iter time (s): 61.937 samples/sec: 16.533 %comms: 0.0029129965745516087 %optimizer_step 0.05595393911845089 %forward: 23.43059562071562 %backward: 62.96120732802418 [2025-04-04 11:52:22,137] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 12942.82 | forward: 145122.88 | backward_microstep: 389973.74 | backward: 389964.97 | backward_inner_microstep: 389949.76 | backward_inner: 389942.05 | backward_allreduce_microstep: 7.32 | backward_allreduce: 2.48 | reduce_tied_grads: 0.26 | comms: 18.04 | reduce_grads: 0.18 | step: 346.56 | _step_clipping: 0.10 | _step_step: 344.92 | _step_zero_grad: 0.47 | _step_check_overflow: 0.52 samples/sec: 16.533 | iteration 14510/ 143000 | elapsed time per iteration (ms): 61937.9 | learning rate: 5.877E-04 | approx flops per GPU: 71.3TFLOPS | lm_loss: 2.417280E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 12:02:41,566] [INFO] [logging.py:60:log_dist] [Rank 0] step=14520, skipped=12, lr=[0.0005877026368013039, 0.0005877026368013039], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14520 loss: 2.4367 iter time (s): 61.942 samples/sec: 16.531 %comms: 0.002886352840485353 %optimizer_step 0.05622173771972664 %forward: 23.4410673553824 %backward: 62.96721690074062 [2025-04-04 12:02:41,567] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 12906.02 | forward: 145199.67 | backward_microstep: 390040.75 | backward: 390034.24 | backward_inner_microstep: 390019.23 | backward_inner: 390013.52 | backward_allreduce_microstep: 7.21 | backward_allreduce: 2.46 | reduce_tied_grads: 0.27 | comms: 17.88 | reduce_grads: 0.18 | step: 348.25 | _step_clipping: 0.09 | _step_step: 346.66 | _step_zero_grad: 0.46 | _step_check_overflow: 0.50 samples/sec: 16.531 | iteration 14520/ 143000 | elapsed time per iteration (ms): 61943.0 | learning rate: 5.877E-04 | approx flops per GPU: 71.3TFLOPS | lm_loss: 2.430517E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 12:13:01,029] [INFO] [logging.py:60:log_dist] [Rank 0] step=14530, skipped=12, lr=[0.0005876839532225364, 0.0005876839532225364], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14530 loss: 2.4200 iter time (s): 61.946 samples/sec: 16.531 %comms: 0.0029124086027043222 %optimizer_step 0.05586478494264907 %forward: 23.43241269653396 %backward: 62.963326480547885 [2025-04-04 12:13:01,030] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 12932.26 | forward: 145153.84 | backward_microstep: 390039.46 | backward: 390031.06 | backward_inner_microstep: 390013.99 | backward_inner: 390008.20 | backward_allreduce_microstep: 8.99 | backward_allreduce: 4.22 | reduce_tied_grads: 0.25 | comms: 18.04 | reduce_grads: 0.18 | step: 346.06 | _step_clipping: 0.10 | _step_step: 344.49 | _step_zero_grad: 0.46 | _step_check_overflow: 0.46 samples/sec: 16.530 | iteration 14530/ 143000 | elapsed time per iteration (ms): 61946.3 | learning rate: 5.877E-04 | approx flops per GPU: 71.3TFLOPS | lm_loss: 2.419997E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 12:23:25,052] [INFO] [logging.py:60:log_dist] [Rank 0] step=14540, skipped=12, lr=[0.0005876652557588513, 0.0005876652557588513], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14540 loss: 2.4288 iter time (s): 62.402 samples/sec: 16.410 %comms: 0.0028975477082292317 %optimizer_step 0.06012222369583492 %forward: 23.281048887420503 %backward: 62.53136233955284 [2025-04-04 12:23:25,053] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17122.27 | forward: 145277.69 | backward_microstep: 390216.22 | backward: 390206.29 | backward_inner_microstep: 390187.87 | backward_inner: 390181.20 | backward_allreduce_microstep: 8.89 | backward_allreduce: 3.06 | reduce_tied_grads: 0.34 | comms: 18.08 | reduce_grads: 0.22 | step: 375.17 | _step_clipping: 0.11 | _step_step: 373.27 | _step_zero_grad: 0.59 | _step_check_overflow: 0.54 samples/sec: 16.410 | iteration 14540/ 143000 | elapsed time per iteration (ms): 62402.3 | learning rate: 5.877E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.418627E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 12:33:48,991] [INFO] [logging.py:60:log_dist] [Rank 0] step=14550, skipped=12, lr=[0.0005876465444111509, 0.0005876465444111509], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14550 loss: 2.3997 iter time (s): 62.393 samples/sec: 16.412 %comms: 0.0028981280124018407 %optimizer_step 0.057562516708595655 %forward: 23.274112026223328 %backward: 62.542261629898576 [2025-04-04 12:33:48,992] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17079.82 | forward: 145214.89 | backward_microstep: 390232.19 | backward: 390221.88 | backward_inner_microstep: 390203.38 | backward_inner: 390196.41 | backward_allreduce_microstep: 9.02 | backward_allreduce: 3.15 | reduce_tied_grads: 0.31 | comms: 18.08 | reduce_grads: 0.23 | step: 359.15 | _step_clipping: 0.10 | _step_step: 357.39 | _step_zero_grad: 0.58 | _step_check_overflow: 0.46 samples/sec: 16.412 | iteration 14550/ 143000 | elapsed time per iteration (ms): 62394.0 | learning rate: 5.876E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.415815E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 12:44:12,710] [INFO] [logging.py:60:log_dist] [Rank 0] step=14560, skipped=12, lr=[0.0005876278191803385, 0.0005876278191803385], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14560 loss: 2.4248 iter time (s): 62.371 samples/sec: 16.418 %comms: 0.0029004537976457844 %optimizer_step 0.058981423874561456 %forward: 23.28309796819907 %backward: 62.55520290114278 [2025-04-04 12:44:12,711] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16914.38 | forward: 145219.54 | backward_microstep: 390174.13 | backward: 390164.48 | backward_inner_microstep: 390146.03 | backward_inner: 390139.30 | backward_allreduce_microstep: 8.98 | backward_allreduce: 3.08 | reduce_tied_grads: 0.34 | comms: 18.09 | reduce_grads: 0.24 | step: 367.87 | _step_clipping: 0.12 | _step_step: 366.01 | _step_zero_grad: 0.57 | _step_check_overflow: 0.50 samples/sec: 16.418 | iteration 14560/ 143000 | elapsed time per iteration (ms): 62371.8 | learning rate: 5.876E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.419140E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 12:54:36,600] [INFO] [logging.py:60:log_dist] [Rank 0] step=14570, skipped=12, lr=[0.0005876090800673178, 0.0005876090800673178], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14570 loss: 2.4298 iter time (s): 62.388 samples/sec: 16.413 %comms: 0.002883754896586083 %optimizer_step 0.056324191694137764 %forward: 23.27955953287489 %backward: 62.54043955974413 [2025-04-04 12:54:36,601] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17071.38 | forward: 145237.60 | backward_microstep: 390190.41 | backward: 390180.21 | backward_inner_microstep: 390163.10 | backward_inner: 390156.60 | backward_allreduce_microstep: 8.25 | backward_allreduce: 2.86 | reduce_tied_grads: 0.31 | comms: 17.99 | reduce_grads: 0.22 | step: 351.40 | _step_clipping: 0.11 | _step_step: 349.76 | _step_zero_grad: 0.46 | _step_check_overflow: 0.49 samples/sec: 16.413 | iteration 14570/ 143000 | elapsed time per iteration (ms): 62389.0 | learning rate: 5.876E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.420306E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 13:04:59,313] [INFO] [logging.py:60:log_dist] [Rank 0] step=14580, skipped=12, lr=[0.0005875903270729933, 0.0005875903270729933], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14580 loss: 2.4110 iter time (s): 62.271 samples/sec: 16.444 %comms: 0.0029006979165870166 %optimizer_step 0.057035981181446885 %forward: 23.32372157878724 %backward: 62.6572932218312 [2025-04-04 13:04:59,313] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15923.20 | forward: 145238.29 | backward_microstep: 390180.87 | backward: 390170.94 | backward_inner_microstep: 390154.38 | backward_inner: 390147.99 | backward_allreduce_microstep: 7.94 | backward_allreduce: 2.71 | reduce_tied_grads: 0.30 | comms: 18.06 | reduce_grads: 0.20 | step: 355.17 | _step_clipping: 0.10 | _step_step: 353.48 | _step_zero_grad: 0.54 | _step_check_overflow: 0.46 samples/sec: 16.444 | iteration 14580/ 143000 | elapsed time per iteration (ms): 62271.2 | learning rate: 5.876E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.416426E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 13:15:21,760] [INFO] [logging.py:60:log_dist] [Rank 0] step=14590, skipped=12, lr=[0.0005875715601982699, 0.0005875715601982699], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14590 loss: 2.4252 iter time (s): 62.244 samples/sec: 16.451 %comms: 0.002901546606694409 %optimizer_step 0.056942071717675825 %forward: 23.3360486586474 %backward: 62.69373254591216 [2025-04-04 13:15:21,761] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15528.71 | forward: 145253.38 | backward_microstep: 390242.41 | backward: 390232.15 | backward_inner_microstep: 390215.74 | backward_inner: 390207.58 | backward_allreduce_microstep: 7.84 | backward_allreduce: 2.72 | reduce_tied_grads: 0.31 | comms: 18.06 | reduce_grads: 0.20 | step: 354.43 | _step_clipping: 0.11 | _step_step: 352.78 | _step_zero_grad: 0.50 | _step_check_overflow: 0.44 samples/sec: 16.451 | iteration 14590/ 143000 | elapsed time per iteration (ms): 62244.7 | learning rate: 5.876E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.415697E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 13:25:43,970] [INFO] [logging.py:60:log_dist] [Rank 0] step=14600, skipped=12, lr=[0.0005875527794440535, 0.0005875527794440535], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14600 loss: 2.4254 iter time (s): 62.220 samples/sec: 16.458 %comms: 0.0028957183038183466 %optimizer_step 0.05675519743260872 %forward: 23.359964981619115 %backward: 62.738582305250986 [2025-04-04 13:25:43,971] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15042.21 | forward: 145346.77 | backward_microstep: 390375.79 | backward: 390362.33 | backward_inner_microstep: 390345.02 | backward_inner: 390338.19 | backward_allreduce_microstep: 8.16 | backward_allreduce: 2.81 | reduce_tied_grads: 0.35 | comms: 18.02 | reduce_grads: 0.20 | step: 353.13 | _step_clipping: 0.10 | _step_step: 351.50 | _step_zero_grad: 0.52 | _step_check_overflow: 0.44 samples/sec: 16.457 | iteration 14600/ 143000 | elapsed time per iteration (ms): 62221.0 | learning rate: 5.876E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.420828E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 13:36:09,926] [INFO] [logging.py:60:log_dist] [Rank 0] step=14610, skipped=12, lr=[0.0005875339848112506, 0.0005875339848112506], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14610 loss: 2.4259 iter time (s): 62.595 samples/sec: 16.359 %comms: 0.002877741803611971 %optimizer_step 0.0572608648038546 %forward: 23.237771958284256 %backward: 62.335056208361586 [2025-04-04 13:36:09,927] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18886.29 | forward: 145456.95 | backward_microstep: 390197.71 | backward: 390186.59 | backward_inner_microstep: 390167.97 | backward_inner: 390161.20 | backward_allreduce_microstep: 9.00 | backward_allreduce: 3.17 | reduce_tied_grads: 0.30 | comms: 18.01 | reduce_grads: 0.23 | step: 358.42 | _step_clipping: 0.11 | _step_step: 356.60 | _step_zero_grad: 0.56 | _step_check_overflow: 0.56 samples/sec: 16.359 | iteration 14610/ 143000 | elapsed time per iteration (ms): 62595.6 | learning rate: 5.875E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.418902E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 13:46:38,780] [INFO] [logging.py:60:log_dist] [Rank 0] step=14620, skipped=12, lr=[0.0005875151763007682, 0.0005875151763007682], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14620 loss: 2.4258 iter time (s): 62.885 samples/sec: 16.284 %comms: 0.002905279252087118 %optimizer_step 0.05774548191712474 %forward: 23.127436947091663 %backward: 62.078748744994364 [2025-04-04 13:46:38,781] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21608.14 | forward: 145436.32 | backward_microstep: 390391.93 | backward: 390380.69 | backward_inner_microstep: 390360.78 | backward_inner: 390353.86 | backward_allreduce_microstep: 10.40 | backward_allreduce: 4.65 | reduce_tied_grads: 0.33 | comms: 18.27 | reduce_grads: 0.24 | step: 363.13 | _step_clipping: 0.12 | _step_step: 361.21 | _step_zero_grad: 0.55 | _step_check_overflow: 0.62 samples/sec: 16.284 | iteration 14620/ 143000 | elapsed time per iteration (ms): 62885.4 | learning rate: 5.875E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.413759E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 13:57:10,671] [INFO] [logging.py:60:log_dist] [Rank 0] step=14630, skipped=12, lr=[0.0005874963539135141, 0.0005874963539135141], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14630 loss: 2.4190 iter time (s): 63.188 samples/sec: 16.206 %comms: 0.0029184865308330035 %optimizer_step 0.05798913392327667 %forward: 23.07476542678186 %backward: 61.84689622979613 [2025-04-04 13:57:10,672] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23746.01 | forward: 145805.67 | backward_microstep: 390822.05 | backward: 390800.41 | backward_inner_microstep: 390781.36 | backward_inner: 390773.85 | backward_allreduce_microstep: 8.80 | backward_allreduce: 3.02 | reduce_tied_grads: 0.37 | comms: 18.44 | reduce_grads: 0.21 | step: 366.42 | _step_clipping: 0.16 | _step_step: 364.49 | _step_zero_grad: 0.54 | _step_check_overflow: 0.59 samples/sec: 16.205 | iteration 14630/ 143000 | elapsed time per iteration (ms): 63189.0 | learning rate: 5.875E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.412827E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 14:07:38,369] [INFO] [logging.py:60:log_dist] [Rank 0] step=14640, skipped=12, lr=[0.0005874775176503968, 0.0005874775176503968], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14640 loss: 2.4171 iter time (s): 62.769 samples/sec: 16.314 %comms: 0.002970416335958757 %optimizer_step 0.05842022638333597 %forward: 23.208845691471517 %backward: 62.239505990611065 [2025-04-04 14:07:38,370] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19815.97 | forward: 145679.93 | backward_microstep: 390690.17 | backward: 390672.03 | backward_inner_microstep: 390653.98 | backward_inner: 390646.91 | backward_allreduce_microstep: 8.29 | backward_allreduce: 2.84 | reduce_tied_grads: 0.37 | comms: 18.65 | reduce_grads: 0.23 | step: 366.70 | _step_clipping: 0.12 | _step_step: 364.55 | _step_zero_grad: 0.59 | _step_check_overflow: 0.73 samples/sec: 16.314 | iteration 14640/ 143000 | elapsed time per iteration (ms): 62769.8 | learning rate: 5.875E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.416080E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 14:18:04,791] [INFO] [logging.py:60:log_dist] [Rank 0] step=14650, skipped=12, lr=[0.0005874586675123255, 0.0005874586675123255], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14650 loss: 2.4345 iter time (s): 62.642 samples/sec: 16.347 %comms: 0.00291651835252819 %optimizer_step 0.05724065001565183 %forward: 23.255063013326193 %backward: 62.33431384573302 [2025-04-04 14:18:04,792] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18808.24 | forward: 145673.45 | backward_microstep: 390486.21 | backward: 390472.14 | backward_inner_microstep: 390454.73 | backward_inner: 390447.88 | backward_allreduce_microstep: 8.14 | backward_allreduce: 2.86 | reduce_tied_grads: 0.29 | comms: 18.27 | reduce_grads: 0.19 | step: 358.56 | _step_clipping: 0.11 | _step_step: 356.68 | _step_zero_grad: 0.53 | _step_check_overflow: 0.61 samples/sec: 16.347 | iteration 14650/ 143000 | elapsed time per iteration (ms): 62642.2 | learning rate: 5.875E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.426437E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 14:28:35,012] [INFO] [logging.py:60:log_dist] [Rank 0] step=14660, skipped=12, lr=[0.0005874398035002099, 0.0005874398035002099], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14660 loss: 2.3887 iter time (s): 63.022 samples/sec: 16.248 %comms: 0.00293366659325804 %optimizer_step 0.05764080198186842 %forward: 23.10978089473756 %backward: 61.968280865088786 [2025-04-04 14:28:35,013] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22542.89 | forward: 145641.31 | backward_microstep: 390548.32 | backward: 390533.41 | backward_inner_microstep: 390515.74 | backward_inner: 390508.86 | backward_allreduce_microstep: 8.29 | backward_allreduce: 2.82 | reduce_tied_grads: 0.36 | comms: 18.49 | reduce_grads: 0.22 | step: 363.26 | _step_clipping: 0.13 | _step_step: 361.38 | _step_zero_grad: 0.51 | _step_check_overflow: 0.59 samples/sec: 16.248 | iteration 14660/ 143000 | elapsed time per iteration (ms): 63022.1 | learning rate: 5.874E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.408552E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 14:39:08,128] [INFO] [logging.py:60:log_dist] [Rank 0] step=14670, skipped=12, lr=[0.0005874209256149604, 0.0005874209256149604], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14670 loss: 2.4184 iter time (s): 63.311 samples/sec: 16.174 %comms: 0.0033555798872818444 %optimizer_step 0.06169317186520615 %forward: 23.083991695011708 %backward: 61.70837700367907 [2025-04-04 14:39:08,129] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24725.35 | forward: 146147.16 | backward_microstep: 390699.38 | backward: 390682.16 | backward_inner_microstep: 390663.37 | backward_inner: 390655.89 | backward_allreduce_microstep: 8.75 | backward_allreduce: 3.00 | reduce_tied_grads: 0.36 | comms: 21.24 | reduce_grads: 0.25 | step: 390.59 | _step_clipping: 0.16 | _step_step: 388.30 | _step_zero_grad: 0.70 | _step_check_overflow: 0.72 samples/sec: 16.174 | iteration 14670/ 143000 | elapsed time per iteration (ms): 63311.6 | learning rate: 5.874E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.419378E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 14:49:38,071] [INFO] [logging.py:60:log_dist] [Rank 0] step=14680, skipped=12, lr=[0.0005874020338574882, 0.0005874020338574882], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14680 loss: 2.4019 iter time (s): 62.994 samples/sec: 16.256 %comms: 0.0028988971463673654 %optimizer_step 0.058655196273437654 %forward: 23.15760120275683 %backward: 62.01683995857328 [2025-04-04 14:49:38,071] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21874.16 | forward: 145878.05 | backward_microstep: 390686.73 | backward: 390666.35 | backward_inner_microstep: 390647.43 | backward_inner: 390640.01 | backward_allreduce_microstep: 8.76 | backward_allreduce: 3.01 | reduce_tied_grads: 0.33 | comms: 18.26 | reduce_grads: 0.21 | step: 369.49 | _step_clipping: 0.12 | _step_step: 367.67 | _step_zero_grad: 0.52 | _step_check_overflow: 0.53 samples/sec: 16.255 | iteration 14680/ 143000 | elapsed time per iteration (ms): 62994.2 | learning rate: 5.874E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.418079E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 15:00:15,844] [INFO] [logging.py:60:log_dist] [Rank 0] step=14690, skipped=12, lr=[0.0005873831282287051, 0.0005873831282287051], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14690 loss: 2.4041 iter time (s): 63.777 samples/sec: 16.056 %comms: 0.0028496146161185636 %optimizer_step 0.055792401465672756 %forward: 22.877266625585392 %backward: 61.26157135515934 [2025-04-04 15:00:15,845] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29654.59 | forward: 145903.91 | backward_microstep: 390726.71 | backward: 390706.76 | backward_inner_microstep: 390687.58 | backward_inner: 390679.99 | backward_allreduce_microstep: 8.99 | backward_allreduce: 3.13 | reduce_tied_grads: 0.29 | comms: 18.17 | reduce_grads: 0.21 | step: 355.83 | _step_clipping: 0.13 | _step_step: 354.03 | _step_zero_grad: 0.50 | _step_check_overflow: 0.55 samples/sec: 16.056 | iteration 14690/ 143000 | elapsed time per iteration (ms): 63777.4 | learning rate: 5.874E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.415428E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 15:10:44,542] [INFO] [logging.py:60:log_dist] [Rank 0] step=14700, skipped=12, lr=[0.0005873642087295235, 0.0005873642087295235], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14700 loss: 2.4200 iter time (s): 62.869 samples/sec: 16.288 %comms: 0.0029190444957590553 %optimizer_step 0.05738268159255668 %forward: 23.188188733736066 %backward: 62.15602410353105 [2025-04-04 15:10:44,543] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20617.20 | forward: 145782.24 | backward_microstep: 390789.09 | backward: 390769.82 | backward_inner_microstep: 390751.33 | backward_inner: 390743.65 | backward_allreduce_microstep: 8.52 | backward_allreduce: 2.96 | reduce_tied_grads: 0.32 | comms: 18.35 | reduce_grads: 0.22 | step: 360.76 | _step_clipping: 0.13 | _step_step: 358.89 | _step_zero_grad: 0.52 | _step_check_overflow: 0.59 samples/sec: 16.288 | iteration 14700/ 143000 | elapsed time per iteration (ms): 62869.8 | learning rate: 5.874E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.413478E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 15:21:22,668] [INFO] [logging.py:60:log_dist] [Rank 0] step=14710, skipped=12, lr=[0.0005873452753608567, 0.0005873452753608567], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14710 loss: 2.4191 iter time (s): 63.812 samples/sec: 16.047 %comms: 0.00288148171942893 %optimizer_step 0.05729871346343426 %forward: 22.876858757467303 %backward: 61.21325577269192 [2025-04-04 15:21:22,669] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30012.69 | forward: 145981.86 | backward_microstep: 390633.13 | backward: 390614.17 | backward_inner_microstep: 390595.48 | backward_inner: 390588.32 | backward_allreduce_microstep: 8.82 | backward_allreduce: 3.08 | reduce_tied_grads: 0.33 | comms: 18.39 | reduce_grads: 0.22 | step: 365.63 | _step_clipping: 0.15 | _step_step: 363.68 | _step_zero_grad: 0.53 | _step_check_overflow: 0.65 samples/sec: 16.047 | iteration 14710/ 143000 | elapsed time per iteration (ms): 63812.6 | learning rate: 5.873E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.415893E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 15:31:52,941] [INFO] [logging.py:60:log_dist] [Rank 0] step=14720, skipped=12, lr=[0.0005873263281236184, 0.0005873263281236184], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14720 loss: 2.4234 iter time (s): 63.027 samples/sec: 16.247 %comms: 0.0029865384346702364 %optimizer_step 0.06002859031590199 %forward: 23.184312245240566 %backward: 62.0285884033069 [2025-04-04 15:31:52,942] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21633.51 | forward: 146122.92 | backward_microstep: 390965.21 | backward: 390945.32 | backward_inner_microstep: 390925.23 | backward_inner: 390917.93 | backward_allreduce_microstep: 8.47 | backward_allreduce: 2.96 | reduce_tied_grads: 0.38 | comms: 18.82 | reduce_grads: 0.24 | step: 378.34 | _step_clipping: 0.14 | _step_step: 376.16 | _step_zero_grad: 0.65 | _step_check_overflow: 0.67 samples/sec: 16.247 | iteration 14720/ 143000 | elapsed time per iteration (ms): 63027.3 | learning rate: 5.873E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.417072E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 15:42:23,688] [INFO] [logging.py:60:log_dist] [Rank 0] step=14730, skipped=12, lr=[0.000587307367018723, 0.000587307367018723], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14730 loss: 2.4213 iter time (s): 63.074 samples/sec: 16.235 %comms: 0.0029131525516619887 %optimizer_step 0.05881719563809995 %forward: 23.179507228716957 %backward: 61.975334687975725 [2025-04-04 15:42:23,689] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22077.12 | forward: 146202.62 | backward_microstep: 390925.26 | backward: 390903.75 | backward_inner_microstep: 390884.96 | backward_inner: 390877.79 | backward_allreduce_microstep: 8.60 | backward_allreduce: 2.98 | reduce_tied_grads: 0.33 | comms: 18.37 | reduce_grads: 0.20 | step: 370.98 | _step_clipping: 0.11 | _step_step: 369.01 | _step_zero_grad: 0.62 | _step_check_overflow: 0.61 samples/sec: 16.235 | iteration 14730/ 143000 | elapsed time per iteration (ms): 63074.7 | learning rate: 5.873E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.426196E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 15:52:54,753] [INFO] [logging.py:60:log_dist] [Rank 0] step=14740, skipped=12, lr=[0.0005872883920470859, 0.0005872883920470859], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14740 loss: 2.4222 iter time (s): 63.106 samples/sec: 16.227 %comms: 0.004292835199454202 %optimizer_step 0.05844153442871663 %forward: 23.109390192409887 %backward: 61.88827377985163 [2025-04-04 15:52:54,754] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23143.48 | forward: 145833.82 | backward_microstep: 390567.59 | backward: 390551.35 | backward_inner_microstep: 390533.19 | backward_inner: 390526.31 | backward_allreduce_microstep: 8.50 | backward_allreduce: 2.94 | reduce_tied_grads: 0.32 | comms: 27.09 | reduce_grads: 0.21 | step: 368.80 | _step_clipping: 0.12 | _step_step: 366.89 | _step_zero_grad: 0.58 | _step_check_overflow: 0.59 samples/sec: 16.227 | iteration 14740/ 143000 | elapsed time per iteration (ms): 63106.5 | learning rate: 5.873E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.418950E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 16:03:27,333] [INFO] [logging.py:60:log_dist] [Rank 0] step=14750, skipped=12, lr=[0.0005872694032096227, 0.0005872694032096227], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14750 loss: 2.4111 iter time (s): 63.257 samples/sec: 16.188 %comms: 0.00293204451280703 %optimizer_step 0.06334543601389626 %forward: 23.035883890824717 %backward: 61.74611667367177 [2025-04-04 16:03:27,334] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24710.46 | forward: 145718.58 | backward_microstep: 390605.79 | backward: 390588.73 | backward_inner_microstep: 390569.10 | backward_inner: 390561.69 | backward_allreduce_microstep: 9.16 | backward_allreduce: 3.14 | reduce_tied_grads: 0.47 | comms: 18.55 | reduce_grads: 0.31 | step: 400.71 | _step_clipping: 0.19 | _step_step: 398.39 | _step_zero_grad: 0.67 | _step_check_overflow: 0.62 samples/sec: 16.188 | iteration 14750/ 143000 | elapsed time per iteration (ms): 63258.0 | learning rate: 5.873E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.418565E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 16:13:58,731] [INFO] [logging.py:60:log_dist] [Rank 0] step=14760, skipped=12, lr=[0.0005872504005072499, 0.0005872504005072499], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14760 loss: 2.4103 iter time (s): 63.139 samples/sec: 16.218 %comms: 0.0028525429467470918 %optimizer_step 0.0561605296135259 %forward: 23.041349155194023 %backward: 61.81320079503425 [2025-04-04 16:13:58,732] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24194.52 | forward: 145480.33 | backward_microstep: 390293.22 | backward: 390281.19 | backward_inner_microstep: 390262.65 | backward_inner: 390255.67 | backward_allreduce_microstep: 8.88 | backward_allreduce: 3.06 | reduce_tied_grads: 0.31 | comms: 18.01 | reduce_grads: 0.23 | step: 354.59 | _step_clipping: 0.13 | _step_step: 352.77 | _step_zero_grad: 0.50 | _step_check_overflow: 0.60 samples/sec: 16.218 | iteration 14760/ 143000 | elapsed time per iteration (ms): 63139.8 | learning rate: 5.873E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.424121E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 16:24:35,900] [INFO] [logging.py:60:log_dist] [Rank 0] step=14770, skipped=12, lr=[0.0005872313839408847, 0.0005872313839408847], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14770 loss: 2.4081 iter time (s): 63.716 samples/sec: 16.071 %comms: 0.002901122041681773 %optimizer_step 0.06001162771031451 %forward: 22.8875795631298 %backward: 61.31171725863575 [2025-04-04 16:24:35,900] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29140.44 | forward: 145830.83 | backward_microstep: 390671.60 | backward: 390654.62 | backward_inner_microstep: 390635.92 | backward_inner: 390628.80 | backward_allreduce_microstep: 8.62 | backward_allreduce: 2.97 | reduce_tied_grads: 0.37 | comms: 18.48 | reduce_grads: 0.23 | step: 382.37 | _step_clipping: 0.16 | _step_step: 380.08 | _step_zero_grad: 0.66 | _step_check_overflow: 0.75 samples/sec: 16.071 | iteration 14770/ 143000 | elapsed time per iteration (ms): 63716.8 | learning rate: 5.872E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.408809E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 16:35:08,245] [INFO] [logging.py:60:log_dist] [Rank 0] step=14780, skipped=12, lr=[0.000587212353511445, 0.000587212353511445], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14780 loss: 2.4106 iter time (s): 63.234 samples/sec: 16.194 %comms: 0.0029457202879191887 %optimizer_step 0.05777739626099501 %forward: 23.05203998887589 %backward: 61.77689922089022 [2025-04-04 16:35:08,246] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24381.21 | forward: 145766.96 | backward_microstep: 390655.28 | backward: 390639.22 | backward_inner_microstep: 390620.66 | backward_inner: 390613.53 | backward_allreduce_microstep: 8.76 | backward_allreduce: 3.00 | reduce_tied_grads: 0.35 | comms: 18.63 | reduce_grads: 0.22 | step: 365.35 | _step_clipping: 0.14 | _step_step: 363.35 | _step_zero_grad: 0.56 | _step_check_overflow: 0.64 samples/sec: 16.194 | iteration 14780/ 143000 | elapsed time per iteration (ms): 63234.5 | learning rate: 5.872E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.413362E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 16:45:51,603] [INFO] [logging.py:60:log_dist] [Rank 0] step=14790, skipped=12, lr=[0.0005871933092198491, 0.0005871933092198491], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14790 loss: 2.4253 iter time (s): 64.335 samples/sec: 15.917 %comms: 0.002828512870892947 %optimizer_step 0.05666968617688424 %forward: 22.67535727236984 %backward: 60.69724584401675 [2025-04-04 16:45:51,604] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 35475.78 | forward: 145882.40 | backward_microstep: 390511.97 | backward: 390497.03 | backward_inner_microstep: 390478.06 | backward_inner: 390469.25 | backward_allreduce_microstep: 9.06 | backward_allreduce: 3.19 | reduce_tied_grads: 0.35 | comms: 18.20 | reduce_grads: 0.22 | step: 364.59 | _step_clipping: 0.15 | _step_step: 362.73 | _step_zero_grad: 0.56 | _step_check_overflow: 0.52 samples/sec: 15.916 | iteration 14790/ 143000 | elapsed time per iteration (ms): 64335.8 | learning rate: 5.872E-04 | approx flops per GPU: 68.7TFLOPS | lm_loss: 2.414749E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 16:56:25,409] [INFO] [logging.py:60:log_dist] [Rank 0] step=14800, skipped=12, lr=[0.0005871742510670163, 0.0005871742510670163], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14800 loss: 2.4057 iter time (s): 63.380 samples/sec: 16.157 %comms: 0.002893975674539481 %optimizer_step 0.05801079793814758 %forward: 22.983099539801405 %backward: 61.61283558998615 [2025-04-04 16:56:25,410] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26128.09 | forward: 145666.89 | backward_microstep: 390517.65 | backward: 390502.16 | backward_inner_microstep: 390482.61 | backward_inner: 390475.33 | backward_allreduce_microstep: 9.27 | backward_allreduce: 3.29 | reduce_tied_grads: 0.35 | comms: 18.34 | reduce_grads: 0.23 | step: 367.67 | _step_clipping: 0.17 | _step_step: 365.74 | _step_zero_grad: 0.55 | _step_check_overflow: 0.55 samples/sec: 16.156 | iteration 14800/ 143000 | elapsed time per iteration (ms): 63380.6 | learning rate: 5.872E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.422602E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 17:06:57,092] [INFO] [logging.py:60:log_dist] [Rank 0] step=14810, skipped=12, lr=[0.0005871551790538666, 0.0005871551790538666], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14810 loss: 2.4267 iter time (s): 63.168 samples/sec: 16.211 %comms: 0.002896306511353322 %optimizer_step 0.05791333508648902 %forward: 23.041310586003583 %backward: 61.803579478773244 [2025-04-04 17:06:57,092] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24221.97 | forward: 145546.55 | backward_microstep: 390413.99 | backward: 390398.70 | backward_inner_microstep: 390380.07 | backward_inner: 390373.01 | backward_allreduce_microstep: 8.82 | backward_allreduce: 3.01 | reduce_tied_grads: 0.38 | comms: 18.30 | reduce_grads: 0.22 | step: 365.82 | _step_clipping: 0.14 | _step_step: 363.81 | _step_zero_grad: 0.58 | _step_check_overflow: 0.65 samples/sec: 16.211 | iteration 14810/ 143000 | elapsed time per iteration (ms): 63168.3 | learning rate: 5.872E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.417512E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 17:17:33,053] [INFO] [logging.py:60:log_dist] [Rank 0] step=14820, skipped=12, lr=[0.0005871360931813201, 0.0005871360931813201], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14820 loss: 2.3986 iter time (s): 63.595 samples/sec: 16.102 %comms: 0.002952589225428509 %optimizer_step 0.05853168762417426 %forward: 22.94035648522414 %backward: 61.40695640428697 [2025-04-04 17:17:33,054] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28046.10 | forward: 145890.29 | backward_microstep: 390536.00 | backward: 390520.47 | backward_inner_microstep: 390500.48 | backward_inner: 390493.35 | backward_allreduce_microstep: 8.60 | backward_allreduce: 3.00 | reduce_tied_grads: 0.37 | comms: 18.78 | reduce_grads: 0.23 | step: 372.24 | _step_clipping: 0.15 | _step_step: 370.18 | _step_zero_grad: 0.61 | _step_check_overflow: 0.60 samples/sec: 16.102 | iteration 14820/ 143000 | elapsed time per iteration (ms): 63596.1 | learning rate: 5.871E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.412082E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 17:28:04,834] [INFO] [logging.py:60:log_dist] [Rank 0] step=14830, skipped=12, lr=[0.0005871169934502983, 0.0005871169934502983], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14830 loss: 2.3993 iter time (s): 63.177 samples/sec: 16.208 %comms: 0.0028753270309477197 %optimizer_step 0.056267038882138486 %forward: 23.02301649511967 %backward: 61.75611856597135 [2025-04-04 17:28:04,834] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24724.27 | forward: 145453.59 | backward_microstep: 390170.93 | backward: 390159.53 | backward_inner_microstep: 390141.42 | backward_inner: 390134.65 | backward_allreduce_microstep: 8.69 | backward_allreduce: 3.00 | reduce_tied_grads: 0.33 | comms: 18.17 | reduce_grads: 0.25 | step: 355.48 | _step_clipping: 0.14 | _step_step: 353.46 | _step_zero_grad: 0.56 | _step_check_overflow: 0.70 samples/sec: 16.208 | iteration 14830/ 143000 | elapsed time per iteration (ms): 63178.1 | learning rate: 5.871E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.418355E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 17:38:30,397] [INFO] [logging.py:60:log_dist] [Rank 0] step=14840, skipped=12, lr=[0.0005870978798617229, 0.0005870978798617229], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14840 loss: 2.4333 iter time (s): 62.556 samples/sec: 16.369 %comms: 0.002874860885965763 %optimizer_step 0.05794698880926994 %forward: 23.212426060258796 %backward: 62.35010192161251 [2025-04-04 17:38:30,398] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18901.78 | forward: 145207.12 | backward_microstep: 390043.95 | backward: 390035.86 | backward_inner_microstep: 390018.42 | backward_inner: 390011.82 | backward_allreduce_microstep: 8.38 | backward_allreduce: 2.86 | reduce_tied_grads: 0.29 | comms: 17.98 | reduce_grads: 0.20 | step: 362.49 | _step_clipping: 0.12 | _step_step: 360.64 | _step_zero_grad: 0.56 | _step_check_overflow: 0.58 samples/sec: 16.369 | iteration 14840/ 143000 | elapsed time per iteration (ms): 62556.4 | learning rate: 5.871E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.423336E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 17:48:57,364] [INFO] [logging.py:60:log_dist] [Rank 0] step=14850, skipped=12, lr=[0.0005870787524165165, 0.0005870787524165165], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14850 loss: 2.4208 iter time (s): 62.696 samples/sec: 16.333 %comms: 0.002876107816166038 %optimizer_step 0.0579465069569229 %forward: 23.162620784934596 %backward: 62.19885396283402 [2025-04-04 17:48:57,365] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20355.35 | forward: 145220.60 | backward_microstep: 389970.81 | backward: 389962.55 | backward_inner_microstep: 389943.37 | backward_inner: 389936.89 | backward_allreduce_microstep: 8.52 | backward_allreduce: 2.89 | reduce_tied_grads: 0.30 | comms: 18.03 | reduce_grads: 0.21 | step: 363.30 | _step_clipping: 0.12 | _step_step: 361.51 | _step_zero_grad: 0.54 | _step_check_overflow: 0.54 samples/sec: 16.333 | iteration 14850/ 143000 | elapsed time per iteration (ms): 62696.7 | learning rate: 5.871E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.416719E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 17:59:27,372] [INFO] [logging.py:60:log_dist] [Rank 0] step=14860, skipped=12, lr=[0.000587059611115602, 0.000587059611115602], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14860 loss: 2.4231 iter time (s): 63.000 samples/sec: 16.254 %comms: 0.002863777824960897 %optimizer_step 0.05729652213973739 %forward: 23.76545815455642 %backward: 61.93313386413539 [2025-04-04 17:59:27,372] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18668.03 | forward: 149722.79 | backward_microstep: 390190.45 | backward: 390179.80 | backward_inner_microstep: 390160.60 | backward_inner: 390154.02 | backward_allreduce_microstep: 10.08 | backward_allreduce: 2.87 | reduce_tied_grads: 0.32 | comms: 18.04 | reduce_grads: 0.20 | step: 360.97 | _step_clipping: 0.12 | _step_step: 359.26 | _step_zero_grad: 0.47 | _step_check_overflow: 0.51 samples/sec: 16.254 | iteration 14860/ 143000 | elapsed time per iteration (ms): 63000.7 | learning rate: 5.871E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.413236E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 18:09:55,285] [INFO] [logging.py:60:log_dist] [Rank 0] step=14870, skipped=12, lr=[0.0005870404559599037, 0.0005870404559599037], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14870 loss: 2.3985 iter time (s): 62.791 samples/sec: 16.308 %comms: 0.002896908144791057 %optimizer_step 0.057422032237943114 %forward: 23.190740382859858 %backward: 62.18316053920226 [2025-04-04 18:09:55,286] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20335.39 | forward: 145616.43 | backward_microstep: 390467.25 | backward: 390452.81 | backward_inner_microstep: 390432.85 | backward_inner: 390425.63 | backward_allreduce_microstep: 8.60 | backward_allreduce: 2.96 | reduce_tied_grads: 0.28 | comms: 18.19 | reduce_grads: 0.23 | step: 360.56 | _step_clipping: 0.11 | _step_step: 358.84 | _step_zero_grad: 0.50 | _step_check_overflow: 0.53 samples/sec: 16.308 | iteration 14870/ 143000 | elapsed time per iteration (ms): 62791.4 | learning rate: 5.870E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.415227E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 18:20:32,913] [INFO] [logging.py:60:log_dist] [Rank 0] step=14880, skipped=12, lr=[0.0005870212869503457, 0.0005870212869503457], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14880 loss: 2.4253 iter time (s): 63.762 samples/sec: 16.060 %comms: 0.0028597308483919496 %optimizer_step 0.058026474083217865 %forward: 22.81042006964464 %backward: 61.21100316278193 [2025-04-04 18:20:32,914] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30398.90 | forward: 145444.10 | backward_microstep: 390306.59 | backward: 390294.39 | backward_inner_microstep: 390276.39 | backward_inner: 390269.36 | backward_allreduce_microstep: 8.46 | backward_allreduce: 2.90 | reduce_tied_grads: 0.36 | comms: 18.23 | reduce_grads: 0.25 | step: 369.99 | _step_clipping: 0.15 | _step_step: 368.00 | _step_zero_grad: 0.60 | _step_check_overflow: 0.55 samples/sec: 16.060 | iteration 14880/ 143000 | elapsed time per iteration (ms): 63762.8 | learning rate: 5.870E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.415239E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 18:31:11,908] [INFO] [logging.py:60:log_dist] [Rank 0] step=14890, skipped=12, lr=[0.0005870021040878535, 0.0005870021040878535], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14890 loss: 2.4321 iter time (s): 63.899 samples/sec: 16.025 %comms: 0.0028406974280005574 %optimizer_step 0.05637337069427664 %forward: 22.76111403625139 %backward: 61.07185069687138 [2025-04-04 18:31:11,909] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31897.38 | forward: 145441.14 | backward_microstep: 390253.75 | backward: 390242.75 | backward_inner_microstep: 390224.77 | backward_inner: 390218.06 | backward_allreduce_microstep: 8.65 | backward_allreduce: 2.97 | reduce_tied_grads: 0.33 | comms: 18.15 | reduce_grads: 0.23 | step: 360.22 | _step_clipping: 0.15 | _step_step: 358.33 | _step_zero_grad: 0.54 | _step_check_overflow: 0.58 samples/sec: 16.025 | iteration 14890/ 143000 | elapsed time per iteration (ms): 63899.5 | learning rate: 5.870E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.419760E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 18:41:46,458] [INFO] [logging.py:60:log_dist] [Rank 0] step=14900, skipped=12, lr=[0.0005869829073733527, 0.0005869829073733527], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14900 loss: 2.4176 iter time (s): 63.454 samples/sec: 16.138 %comms: 0.0028432837550108163 %optimizer_step 0.05657632317552273 %forward: 22.954367691029468 %backward: 61.50194276653497 [2025-04-04 18:41:46,458] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27200.93 | forward: 145655.26 | backward_microstep: 390268.34 | backward: 390256.08 | backward_inner_microstep: 390237.38 | backward_inner: 390230.48 | backward_allreduce_microstep: 8.84 | backward_allreduce: 3.07 | reduce_tied_grads: 0.34 | comms: 18.04 | reduce_grads: 0.22 | step: 359.00 | _step_clipping: 0.13 | _step_step: 357.09 | _step_zero_grad: 0.55 | _step_check_overflow: 0.62 samples/sec: 16.137 | iteration 14900/ 143000 | elapsed time per iteration (ms): 63454.9 | learning rate: 5.870E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.426543E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 18:52:12,454] [INFO] [logging.py:60:log_dist] [Rank 0] step=14910, skipped=12, lr=[0.0005869636968077698, 0.0005869636968077698], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14910 loss: 2.4109 iter time (s): 62.599 samples/sec: 16.358 %comms: 0.002902542029658111 %optimizer_step 0.05633446776886925 %forward: 23.231049084985422 %backward: 62.322612882891804 [2025-04-04 18:52:12,455] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18993.29 | forward: 145424.19 | backward_microstep: 390145.42 | backward: 390133.72 | backward_inner_microstep: 390114.04 | backward_inner: 390107.28 | backward_allreduce_microstep: 8.65 | backward_allreduce: 2.96 | reduce_tied_grads: 0.32 | comms: 18.17 | reduce_grads: 0.21 | step: 352.65 | _step_clipping: 0.12 | _step_step: 350.77 | _step_zero_grad: 0.55 | _step_check_overflow: 0.58 samples/sec: 16.358 | iteration 14910/ 143000 | elapsed time per iteration (ms): 62599.7 | learning rate: 5.870E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.416990E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 19:02:47,546] [INFO] [logging.py:60:log_dist] [Rank 0] step=14920, skipped=12, lr=[0.0005869444723920322, 0.0005869444723920322], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14920 loss: 2.4223 iter time (s): 63.508 samples/sec: 16.124 %comms: 0.002941880413793282 %optimizer_step 0.05831634839191996 %forward: 22.98827885789819 %backward: 61.444998117962754 [2025-04-04 19:02:47,547] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27375.36 | forward: 145995.05 | backward_microstep: 390239.55 | backward: 390227.81 | backward_inner_microstep: 390209.76 | backward_inner: 390202.94 | backward_allreduce_microstep: 8.68 | backward_allreduce: 2.95 | reduce_tied_grads: 0.36 | comms: 18.68 | reduce_grads: 0.23 | step: 370.36 | _step_clipping: 0.16 | _step_step: 368.27 | _step_zero_grad: 0.62 | _step_check_overflow: 0.58 samples/sec: 16.124 | iteration 14920/ 143000 | elapsed time per iteration (ms): 63509.2 | learning rate: 5.869E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.423116E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 19:13:13,532] [INFO] [logging.py:60:log_dist] [Rank 0] step=14930, skipped=12, lr=[0.0005869252341270677, 0.0005869252341270677], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14930 loss: 2.4083 iter time (s): 62.598 samples/sec: 16.358 %comms: 0.0029013715372504245 %optimizer_step 0.05852519285982432 %forward: 23.262870082909235 %backward: 62.340933330016945 [2025-04-04 19:13:13,533] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18628.00 | forward: 145620.97 | backward_microstep: 390256.94 | backward: 390241.92 | backward_inner_microstep: 390223.47 | backward_inner: 390216.40 | backward_allreduce_microstep: 8.71 | backward_allreduce: 3.00 | reduce_tied_grads: 0.30 | comms: 18.16 | reduce_grads: 0.21 | step: 366.36 | _step_clipping: 0.12 | _step_step: 364.32 | _step_zero_grad: 0.59 | _step_check_overflow: 0.71 samples/sec: 16.358 | iteration 14930/ 143000 | elapsed time per iteration (ms): 62598.6 | learning rate: 5.869E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.414030E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 19:23:41,502] [INFO] [logging.py:60:log_dist] [Rank 0] step=14940, skipped=12, lr=[0.0005869059820138048, 0.0005869059820138048], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14940 loss: 2.3944 iter time (s): 62.796 samples/sec: 16.307 %comms: 0.002942853009963943 %optimizer_step 0.05785735354338021 %forward: 23.170794624502857 %backward: 62.157460946713485 [2025-04-04 19:23:41,503] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20632.62 | forward: 145504.29 | backward_microstep: 390340.78 | backward: 390326.58 | backward_inner_microstep: 390309.08 | backward_inner: 390302.41 | backward_allreduce_microstep: 8.14 | backward_allreduce: 2.94 | reduce_tied_grads: 0.33 | comms: 18.48 | reduce_grads: 0.20 | step: 363.32 | _step_clipping: 0.12 | _step_step: 361.40 | _step_zero_grad: 0.53 | _step_check_overflow: 0.65 samples/sec: 16.307 | iteration 14940/ 143000 | elapsed time per iteration (ms): 62797.0 | learning rate: 5.869E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.407932E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 19:34:11,101] [INFO] [logging.py:60:log_dist] [Rank 0] step=14950, skipped=12, lr=[0.0005868867160531727, 0.0005868867160531727], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14950 loss: 2.4061 iter time (s): 62.959 samples/sec: 16.265 %comms: 0.0028866969409549557 %optimizer_step 0.058495630855861604 %forward: 23.126955543194825 %backward: 62.03537582894675 [2025-04-04 19:34:11,101] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21862.03 | forward: 145605.44 | backward_microstep: 390585.49 | backward: 390569.70 | backward_inner_microstep: 390550.49 | backward_inner: 390543.29 | backward_allreduce_microstep: 8.95 | backward_allreduce: 3.07 | reduce_tied_grads: 0.35 | comms: 18.17 | reduce_grads: 0.21 | step: 368.28 | _step_clipping: 0.12 | _step_step: 366.41 | _step_zero_grad: 0.58 | _step_check_overflow: 0.55 samples/sec: 16.264 | iteration 14950/ 143000 | elapsed time per iteration (ms): 62959.8 | learning rate: 5.869E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.415691E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 19:44:39,489] [INFO] [logging.py:60:log_dist] [Rank 0] step=14960, skipped=12, lr=[0.0005868674362461012, 0.0005868674362461012], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14960 loss: 2.4049 iter time (s): 62.838 samples/sec: 16.296 %comms: 0.0028893318658978627 %optimizer_step 0.05884046640419678 %forward: 23.153369705275782 %backward: 62.14296730126334 [2025-04-04 19:44:39,490] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20827.70 | forward: 145491.69 | backward_microstep: 390511.68 | backward: 390495.44 | backward_inner_microstep: 390476.49 | backward_inner: 390469.31 | backward_allreduce_microstep: 8.97 | backward_allreduce: 3.08 | reduce_tied_grads: 0.34 | comms: 18.16 | reduce_grads: 0.21 | step: 369.74 | _step_clipping: 0.13 | _step_step: 367.87 | _step_zero_grad: 0.56 | _step_check_overflow: 0.57 samples/sec: 16.296 | iteration 14960/ 143000 | elapsed time per iteration (ms): 62838.9 | learning rate: 5.869E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.411238E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 19:55:16,697] [INFO] [logging.py:60:log_dist] [Rank 0] step=14970, skipped=12, lr=[0.0005868481425935208, 0.0005868481425935208], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14970 loss: 2.4130 iter time (s): 63.720 samples/sec: 16.070 %comms: 0.0028487038550679936 %optimizer_step 0.05782350607512691 %forward: 22.87417106174165 %backward: 61.283597228718776 [2025-04-04 19:55:16,698] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29377.31 | forward: 145754.68 | backward_microstep: 390514.74 | backward: 390500.32 | backward_inner_microstep: 390481.32 | backward_inner: 390473.83 | backward_allreduce_microstep: 8.98 | backward_allreduce: 3.07 | reduce_tied_grads: 0.36 | comms: 18.15 | reduce_grads: 0.23 | step: 368.45 | _step_clipping: 0.14 | _step_step: 366.53 | _step_zero_grad: 0.56 | _step_check_overflow: 0.59 samples/sec: 16.070 | iteration 14970/ 143000 | elapsed time per iteration (ms): 63720.8 | learning rate: 5.868E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.411094E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 20:05:56,371] [INFO] [logging.py:60:log_dist] [Rank 0] step=14980, skipped=12, lr=[0.000586828835096363, 0.000586828835096363], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14980 loss: 2.4224 iter time (s): 63.967 samples/sec: 16.008 %comms: 0.0029285192201102587 %optimizer_step 0.058897075814612634 %forward: 22.8265120349678 %backward: 61.104397707155556 [2025-04-04 20:05:56,372] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31217.78 | forward: 146013.78 | backward_microstep: 390882.25 | backward: 390864.98 | backward_inner_microstep: 390845.37 | backward_inner: 390838.05 | backward_allreduce_microstep: 9.25 | backward_allreduce: 3.16 | reduce_tied_grads: 0.38 | comms: 18.73 | reduce_grads: 0.24 | step: 376.75 | _step_clipping: 0.13 | _step_step: 374.52 | _step_zero_grad: 0.62 | _step_check_overflow: 0.78 samples/sec: 16.008 | iteration 14980/ 143000 | elapsed time per iteration (ms): 63967.4 | learning rate: 5.868E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.412775E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 20:16:34,514] [INFO] [logging.py:60:log_dist] [Rank 0] step=14990, skipped=12, lr=[0.0005868095137555592, 0.0005868095137555592], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 14990 loss: 2.3912 iter time (s): 63.814 samples/sec: 16.047 %comms: 0.0028766631122164677 %optimizer_step 0.05936342449579924 %forward: 22.892142552431345 %backward: 61.270235556385565 [2025-04-04 20:16:34,515] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29472.23 | forward: 146083.13 | backward_microstep: 391005.98 | backward: 390987.76 | backward_inner_microstep: 390967.58 | backward_inner: 390959.76 | backward_allreduce_microstep: 9.56 | backward_allreduce: 3.29 | reduce_tied_grads: 0.40 | comms: 18.36 | reduce_grads: 0.25 | step: 378.82 | _step_clipping: 0.15 | _step_step: 376.70 | _step_zero_grad: 0.62 | _step_check_overflow: 0.65 samples/sec: 16.047 | iteration 14990/ 143000 | elapsed time per iteration (ms): 63814.3 | learning rate: 5.868E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.412427E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 20:27:16,824] [INFO] [logging.py:60:log_dist] [Rank 0] step=15000, skipped=12, lr=[0.0005867901785720423, 0.0005867901785720423], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15000 loss: 2.4022 iter time (s): 64.230 samples/sec: 15.943 %comms: 0.0028903680088359835 %optimizer_step 0.06008384931350293 %forward: 22.781337256311083 %backward: 60.91414587755083 [2025-04-04 20:27:16,825] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 33056.07 | forward: 146325.36 | backward_microstep: 391276.60 | backward: 391253.79 | backward_inner_microstep: 391232.94 | backward_inner: 391225.07 | backward_allreduce_microstep: 9.74 | backward_allreduce: 3.30 | reduce_tied_grads: 0.36 | comms: 18.56 | reduce_grads: 0.22 | step: 385.92 | _step_clipping: 0.13 | _step_step: 383.84 | _step_zero_grad: 0.58 | _step_check_overflow: 0.68 samples/sec: 15.942 | iteration 15000/ 143000 | elapsed time per iteration (ms): 64231.0 | learning rate: 5.868E-04 | approx flops per GPU: 68.8TFLOPS | lm_loss: 2.406195E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 20:27:20,003] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: ../checkpoints/mydata_left-pythia160m/global_step15000/mp_rank_00_model_states.pt [2025-04-04 20:27:34,326] [INFO] [engine.py:1805:_copy_recovery_script] creating recovery script ../checkpoints/mydata_left-pythia160m/zero_to_fp32.py [2025-04-04 20:27:34,331] [INFO] [engine.py:1818:_save_zero_checkpoint] zero checkpoint saved ../checkpoints/mydata_left-pythia160m/global_step15000/zero_pp_rank_0_mp_rank_00_optim_states.pt [2025-04-04 20:38:08,263] [INFO] [logging.py:60:log_dist] [Rank 0] step=15010, skipped=12, lr=[0.0005867708295467453, 0.0005867708295467453], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15010 loss: 2.3936 iter time (s): 63.392 samples/sec: 16.154 %comms: 0.002892085349857138 %optimizer_step 0.057793377699460614 %forward: 23.028330168309058 %backward: 61.66454654562747 [2025-04-04 20:38:08,264] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25419.95 | forward: 145980.61 | backward_microstep: 390921.64 | backward: 390902.35 | backward_inner_microstep: 390882.01 | backward_inner: 390874.33 | backward_allreduce_microstep: 9.58 | backward_allreduce: 3.31 | reduce_tied_grads: 0.38 | comms: 18.33 | reduce_grads: 0.25 | step: 366.36 | _step_clipping: 0.12 | _step_step: 364.39 | _step_zero_grad: 0.56 | _step_check_overflow: 0.64 samples/sec: 15.719 | iteration 15010/ 143000 | elapsed time per iteration (ms): 65143.9 | learning rate: 5.868E-04 | approx flops per GPU: 67.8TFLOPS | lm_loss: 2.414383E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 20:48:43,468] [INFO] [logging.py:60:log_dist] [Rank 0] step=15020, skipped=12, lr=[0.0005867514666806022, 0.0005867514666806022], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15020 loss: 2.4142 iter time (s): 63.520 samples/sec: 16.121 %comms: 0.003116380099476135 %optimizer_step 0.05905642945469133 %forward: 22.953795081119793 %backward: 61.48584466718741 [2025-04-04 20:48:43,468] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27258.46 | forward: 145802.01 | backward_microstep: 390570.26 | backward: 390556.76 | backward_inner_microstep: 390537.75 | backward_inner: 390530.42 | backward_allreduce_microstep: 9.11 | backward_allreduce: 3.12 | reduce_tied_grads: 0.43 | comms: 19.80 | reduce_grads: 0.24 | step: 375.13 | _step_clipping: 0.15 | _step_step: 372.92 | _step_zero_grad: 0.63 | _step_check_overflow: 0.70 samples/sec: 16.121 | iteration 15020/ 143000 | elapsed time per iteration (ms): 63520.5 | learning rate: 5.868E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.414385E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 20:59:12,873] [INFO] [logging.py:60:log_dist] [Rank 0] step=15030, skipped=12, lr=[0.0005867320899745474, 0.0005867320899745474], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15030 loss: 2.4257 iter time (s): 62.940 samples/sec: 16.269 %comms: 0.003015502327526415 %optimizer_step 0.05801211149921641 %forward: 23.15849198100708 %backward: 62.08021638177386 [2025-04-04 20:59:12,874] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21275.67 | forward: 145759.38 | backward_microstep: 390751.34 | backward: 390732.42 | backward_inner_microstep: 390713.62 | backward_inner: 390706.39 | backward_allreduce_microstep: 8.80 | backward_allreduce: 3.02 | reduce_tied_grads: 0.38 | comms: 18.98 | reduce_grads: 0.23 | step: 365.13 | _step_clipping: 0.13 | _step_step: 363.03 | _step_zero_grad: 0.58 | _step_check_overflow: 0.71 samples/sec: 16.269 | iteration 15030/ 143000 | elapsed time per iteration (ms): 62940.6 | learning rate: 5.867E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.418338E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 21:09:45,234] [INFO] [logging.py:60:log_dist] [Rank 0] step=15040, skipped=12, lr=[0.0005867126994295163, 0.0005867126994295163], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15040 loss: 2.4207 iter time (s): 63.235 samples/sec: 16.193 %comms: 0.0028908668905743124 %optimizer_step 0.058286631446707815 %forward: 23.050592960633175 %backward: 61.746124321528164 [2025-04-04 21:09:45,234] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24540.03 | forward: 145761.30 | backward_microstep: 390467.93 | backward: 390453.95 | backward_inner_microstep: 390435.20 | backward_inner: 390428.15 | backward_allreduce_microstep: 8.89 | backward_allreduce: 3.05 | reduce_tied_grads: 0.35 | comms: 18.28 | reduce_grads: 0.22 | step: 368.58 | _step_clipping: 0.14 | _step_step: 366.42 | _step_zero_grad: 0.59 | _step_check_overflow: 0.78 samples/sec: 16.193 | iteration 15040/ 143000 | elapsed time per iteration (ms): 63236.0 | learning rate: 5.867E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.424675E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 21:20:15,544] [INFO] [logging.py:60:log_dist] [Rank 0] step=15050, skipped=12, lr=[0.0005866932950464446, 0.0005866932950464446], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15050 loss: 2.4226 iter time (s): 63.030 samples/sec: 16.246 %comms: 0.0029010578980822968 %optimizer_step 0.05670167686563179 %forward: 23.13525140743257 %backward: 61.93708133733495 [2025-04-04 21:20:15,545] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22534.98 | forward: 145822.65 | backward_microstep: 390409.15 | backward: 390392.52 | backward_inner_microstep: 390373.69 | backward_inner: 390366.70 | backward_allreduce_microstep: 8.96 | backward_allreduce: 3.09 | reduce_tied_grads: 0.34 | comms: 18.29 | reduce_grads: 0.22 | step: 357.39 | _step_clipping: 0.14 | _step_step: 355.48 | _step_zero_grad: 0.51 | _step_check_overflow: 0.66 samples/sec: 16.246 | iteration 15050/ 143000 | elapsed time per iteration (ms): 63031.1 | learning rate: 5.867E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.420968E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 21:30:49,418] [INFO] [logging.py:60:log_dist] [Rank 0] step=15060, skipped=12, lr=[0.0005866738768262689, 0.0005866738768262689], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15060 loss: 2.4226 iter time (s): 63.387 samples/sec: 16.155 %comms: 0.00286997068156554 %optimizer_step 0.05739347072853384 %forward: 22.966403185953165 %backward: 61.6055947729486 [2025-04-04 21:30:49,419] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26186.24 | forward: 145576.59 | backward_microstep: 390510.96 | backward: 390497.91 | backward_inner_microstep: 390479.37 | backward_inner: 390472.41 | backward_allreduce_microstep: 8.78 | backward_allreduce: 3.09 | reduce_tied_grads: 0.34 | comms: 18.19 | reduce_grads: 0.21 | step: 363.80 | _step_clipping: 0.12 | _step_step: 361.85 | _step_zero_grad: 0.60 | _step_check_overflow: 0.60 samples/sec: 16.155 | iteration 15060/ 143000 | elapsed time per iteration (ms): 63387.4 | learning rate: 5.867E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.417067E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 21:41:19,757] [INFO] [logging.py:60:log_dist] [Rank 0] step=15070, skipped=12, lr=[0.0005866544447699265, 0.0005866544447699265], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15070 loss: 2.3996 iter time (s): 63.033 samples/sec: 16.245 %comms: 0.0029148852508201267 %optimizer_step 0.056844649999851275 %forward: 23.09749966480938 %backward: 61.92803028791209 [2025-04-04 21:41:19,758] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22829.02 | forward: 145591.21 | backward_microstep: 390364.53 | backward: 390352.93 | backward_inner_microstep: 390335.11 | backward_inner: 390328.30 | backward_allreduce_microstep: 8.69 | backward_allreduce: 3.23 | reduce_tied_grads: 0.36 | comms: 18.37 | reduce_grads: 0.23 | step: 358.31 | _step_clipping: 0.12 | _step_step: 356.46 | _step_zero_grad: 0.51 | _step_check_overflow: 0.60 samples/sec: 16.245 | iteration 15070/ 143000 | elapsed time per iteration (ms): 63033.9 | learning rate: 5.867E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.407511E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 21:51:49,788] [INFO] [logging.py:60:log_dist] [Rank 0] step=15080, skipped=12, lr=[0.0005866349988783552, 0.0005866349988783552], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15080 loss: 2.4042 iter time (s): 63.003 samples/sec: 16.253 %comms: 0.0029041616231142435 %optimizer_step 0.05743430599976499 %forward: 23.09933231911617 %backward: 61.94587493211149 [2025-04-04 21:51:49,789] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22611.71 | forward: 145531.67 | backward_microstep: 390287.64 | backward: 390274.77 | backward_inner_microstep: 390257.18 | backward_inner: 390250.52 | backward_allreduce_microstep: 8.28 | backward_allreduce: 2.85 | reduce_tied_grads: 0.33 | comms: 18.30 | reduce_grads: 0.20 | step: 361.85 | _step_clipping: 0.14 | _step_step: 359.92 | _step_zero_grad: 0.50 | _step_check_overflow: 0.70 samples/sec: 16.253 | iteration 15080/ 143000 | elapsed time per iteration (ms): 63003.1 | learning rate: 5.866E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.415345E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 22:02:14,394] [INFO] [logging.py:60:log_dist] [Rank 0] step=15090, skipped=12, lr=[0.0005866155391524936, 0.0005866155391524936], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15090 loss: 2.4245 iter time (s): 62.460 samples/sec: 16.394 %comms: 0.002911104712962909 %optimizer_step 0.05906262879428707 %forward: 23.351743201424878 %backward: 62.50258997337889 [2025-04-04 22:02:14,395] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16736.19 | forward: 145854.94 | backward_microstep: 390402.70 | backward: 390391.06 | backward_inner_microstep: 390373.65 | backward_inner: 390367.08 | backward_allreduce_microstep: 8.25 | backward_allreduce: 2.83 | reduce_tied_grads: 0.35 | comms: 18.18 | reduce_grads: 0.22 | step: 368.91 | _step_clipping: 0.13 | _step_step: 367.04 | _step_zero_grad: 0.50 | _step_check_overflow: 0.61 samples/sec: 16.394 | iteration 15090/ 143000 | elapsed time per iteration (ms): 62460.6 | learning rate: 5.866E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.408566E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 22:12:43,540] [INFO] [logging.py:60:log_dist] [Rank 0] step=15100, skipped=12, lr=[0.0005865960655932806, 0.0005865960655932806], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15100 loss: 2.4008 iter time (s): 62.914 samples/sec: 16.276 %comms: 0.002920074802543068 %optimizer_step 0.05932494531162794 %forward: 23.143977035787476 %backward: 62.09430218982107 [2025-04-04 22:12:43,541] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21197.91 | forward: 145607.90 | backward_microstep: 390676.40 | backward: 390659.77 | backward_inner_microstep: 390641.09 | backward_inner: 390633.82 | backward_allreduce_microstep: 8.79 | backward_allreduce: 3.04 | reduce_tied_grads: 0.35 | comms: 18.37 | reduce_grads: 0.21 | step: 373.24 | _step_clipping: 0.17 | _step_step: 371.06 | _step_zero_grad: 0.58 | _step_check_overflow: 0.75 samples/sec: 16.276 | iteration 15100/ 143000 | elapsed time per iteration (ms): 62914.6 | learning rate: 5.866E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.404979E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 22:23:15,072] [INFO] [logging.py:60:log_dist] [Rank 0] step=15110, skipped=12, lr=[0.0005865765782016565, 0.0005865765782016565], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15110 loss: 2.4186 iter time (s): 63.153 samples/sec: 16.215 %comms: 0.0028954521803659285 %optimizer_step 0.05735985340107588 %forward: 23.073332566369785 %backward: 61.82605208995828 [2025-04-04 22:23:15,072] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23688.96 | forward: 145713.93 | backward_microstep: 390461.54 | backward: 390447.16 | backward_inner_microstep: 390427.14 | backward_inner: 390420.23 | backward_allreduce_microstep: 8.78 | backward_allreduce: 2.97 | reduce_tied_grads: 0.36 | comms: 18.29 | reduce_grads: 0.40 | step: 362.24 | _step_clipping: 0.13 | _step_step: 360.37 | _step_zero_grad: 0.56 | _step_check_overflow: 0.57 samples/sec: 16.215 | iteration 15110/ 143000 | elapsed time per iteration (ms): 63153.2 | learning rate: 5.866E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.418016E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 22:33:49,056] [INFO] [logging.py:60:log_dist] [Rank 0] step=15120, skipped=12, lr=[0.0005865570769785617, 0.0005865570769785617], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15120 loss: 2.4312 iter time (s): 63.398 samples/sec: 16.152 %comms: 0.0028817679140605173 %optimizer_step 0.05655420172479794 %forward: 22.975603939415134 %backward: 61.61071016662508 [2025-04-04 22:33:49,057] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26054.04 | forward: 145660.30 | backward_microstep: 390614.28 | backward: 390598.41 | backward_inner_microstep: 390580.26 | backward_inner: 390573.28 | backward_allreduce_microstep: 8.42 | backward_allreduce: 2.89 | reduce_tied_grads: 0.36 | comms: 18.27 | reduce_grads: 0.21 | step: 358.54 | _step_clipping: 0.13 | _step_step: 356.64 | _step_zero_grad: 0.53 | _step_check_overflow: 0.59 samples/sec: 16.152 | iteration 15120/ 143000 | elapsed time per iteration (ms): 63398.4 | learning rate: 5.866E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.423344E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 22:44:17,944] [INFO] [logging.py:60:log_dist] [Rank 0] step=15130, skipped=12, lr=[0.0005865375619249374, 0.0005865375619249374], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15130 loss: 2.4242 iter time (s): 62.888 samples/sec: 16.283 %comms: 0.0029022387153605435 %optimizer_step 0.057276952788757024 %forward: 23.138501972506297 %backward: 62.05948821203767 [2025-04-04 22:44:17,944] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21508.13 | forward: 145513.88 | backward_microstep: 390292.90 | backward: 390280.96 | backward_inner_microstep: 390264.04 | backward_inner: 390257.51 | backward_allreduce_microstep: 7.90 | backward_allreduce: 2.71 | reduce_tied_grads: 0.33 | comms: 18.25 | reduce_grads: 0.21 | step: 360.20 | _step_clipping: 0.13 | _step_step: 358.38 | _step_zero_grad: 0.54 | _step_check_overflow: 0.53 samples/sec: 16.283 | iteration 15130/ 143000 | elapsed time per iteration (ms): 62888.8 | learning rate: 5.865E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.426956E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 22:54:48,193] [INFO] [logging.py:60:log_dist] [Rank 0] step=15140, skipped=12, lr=[0.0005865180330417255, 0.0005865180330417255], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15140 loss: 2.4181 iter time (s): 63.024 samples/sec: 16.248 %comms: 0.0028862505637279063 %optimizer_step 0.059752074950331385 %forward: 23.11676860507506 %backward: 61.95584021505638 [2025-04-04 22:54:48,193] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22394.31 | forward: 145691.75 | backward_microstep: 390486.48 | backward: 390472.17 | backward_inner_microstep: 390450.61 | backward_inner: 390443.63 | backward_allreduce_microstep: 8.58 | backward_allreduce: 2.94 | reduce_tied_grads: 0.35 | comms: 18.19 | reduce_grads: 0.22 | step: 376.58 | _step_clipping: 0.14 | _step_step: 374.66 | _step_zero_grad: 0.57 | _step_check_overflow: 0.59 samples/sec: 16.248 | iteration 15140/ 143000 | elapsed time per iteration (ms): 63024.9 | learning rate: 5.865E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.417370E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 22:58:58,719] [INFO] [stage1.py:697:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 524288.0, reducing to 524288.0 [2025-04-04 23:00:01,227] [INFO] [stage1.py:697:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 524288.0, reducing to 262144.0 [2025-04-04 23:05:13,926] [INFO] [logging.py:60:log_dist] [Rank 0] step=15150, skipped=14, lr=[0.0005865023999784862, 0.0005865023999784862], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15150 loss: 2.4385 iter time (s): 62.573 samples/sec: 16.365 %comms: 0.002334052981443519 %optimizer_step 0.048469292394515244 %forward: 23.26208003690993 %backward: 62.4089326641977 [2025-04-04 23:05:13,927] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18021.10 | forward: 145557.19 | backward_microstep: 390523.40 | backward: 390509.73 | backward_inner_microstep: 390491.68 | backward_inner: 390484.77 | backward_allreduce_microstep: 8.44 | backward_allreduce: 2.91 | reduce_tied_grads: 0.35 | comms: 14.60 | reduce_grads: 0.20 | step: 303.29 | _step_clipping: 0.12 | _step_step: 301.33 | _step_zero_grad: 0.53 | _step_check_overflow: 0.69 samples/sec: 16.365 | iteration 15150/ 143000 | elapsed time per iteration (ms): 62573.4 | learning rate: 5.865E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.432449E+00 | loss scale: 262144.0 | number of skipped iterations: 2 | number of nan iterations: 0 | time (ms) [2025-04-04 23:15:43,138] [INFO] [logging.py:60:log_dist] [Rank 0] step=15160, skipped=14, lr=[0.0005864828462043923, 0.0005864828462043923], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15160 loss: 2.4233 iter time (s): 62.921 samples/sec: 16.274 %comms: 0.0028739195034597053 %optimizer_step 0.056702816407162154 %forward: 23.124544169539206 %backward: 62.04582157400903 [2025-04-04 23:15:43,139] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21723.53 | forward: 145500.88 | backward_microstep: 390410.59 | backward: 390395.67 | backward_inner_microstep: 390377.57 | backward_inner: 390370.43 | backward_allreduce_microstep: 8.62 | backward_allreduce: 2.95 | reduce_tied_grads: 0.36 | comms: 18.08 | reduce_grads: 0.23 | step: 356.78 | _step_clipping: 0.14 | _step_step: 354.86 | _step_zero_grad: 0.57 | _step_check_overflow: 0.57 samples/sec: 16.274 | iteration 15160/ 143000 | elapsed time per iteration (ms): 62921.2 | learning rate: 5.865E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.427755E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 23:26:17,517] [INFO] [logging.py:60:log_dist] [Rank 0] step=15170, skipped=14, lr=[0.0005864632786033515, 0.0005864632786033515], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15170 loss: 2.3904 iter time (s): 63.437 samples/sec: 16.142 %comms: 0.0028679085324974504 %optimizer_step 0.05731194315748463 %forward: 22.924978070711116 %backward: 61.50868219545054 [2025-04-04 23:26:17,518] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27155.21 | forward: 145429.93 | backward_microstep: 390205.11 | backward: 390194.63 | backward_inner_microstep: 390175.76 | backward_inner: 390168.76 | backward_allreduce_microstep: 9.15 | backward_allreduce: 3.10 | reduce_tied_grads: 0.35 | comms: 18.19 | reduce_grads: 0.23 | step: 363.57 | _step_clipping: 0.14 | _step_step: 361.67 | _step_zero_grad: 0.58 | _step_check_overflow: 0.55 samples/sec: 16.142 | iteration 15170/ 143000 | elapsed time per iteration (ms): 63437.9 | learning rate: 5.865E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.409848E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 23:36:47,117] [INFO] [logging.py:60:log_dist] [Rank 0] step=15180, skipped=14, lr=[0.0005864436971763084, 0.0005864436971763084], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15180 loss: 2.4146 iter time (s): 62.959 samples/sec: 16.264 %comms: 0.0029546671367353342 %optimizer_step 0.05860665425370952 %forward: 23.116111604759226 %backward: 62.00103273159873 [2025-04-04 23:36:47,117] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22083.08 | forward: 145537.37 | backward_microstep: 390369.34 | backward: 390354.03 | backward_inner_microstep: 390334.33 | backward_inner: 390327.21 | backward_allreduce_microstep: 8.40 | backward_allreduce: 2.88 | reduce_tied_grads: 0.36 | comms: 18.60 | reduce_grads: 0.23 | step: 368.98 | _step_clipping: 0.15 | _step_step: 366.93 | _step_zero_grad: 0.54 | _step_check_overflow: 0.66 samples/sec: 16.264 | iteration 15180/ 143000 | elapsed time per iteration (ms): 62959.9 | learning rate: 5.864E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.423141E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 23:47:22,793] [INFO] [logging.py:60:log_dist] [Rank 0] step=15190, skipped=14, lr=[0.000586424101924208, 0.000586424101924208], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15190 loss: 2.4048 iter time (s): 63.567 samples/sec: 16.109 %comms: 0.0029060505323446986 %optimizer_step 0.057210874972110776 %forward: 22.965195173702718 %backward: 61.44923075879238 [2025-04-04 23:47:22,794] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27387.77 | forward: 145983.00 | backward_microstep: 390633.45 | backward: 390614.72 | backward_inner_microstep: 390596.30 | backward_inner: 390589.07 | backward_allreduce_microstep: 8.71 | backward_allreduce: 3.02 | reduce_tied_grads: 0.34 | comms: 18.47 | reduce_grads: 0.21 | step: 363.67 | _step_clipping: 0.12 | _step_step: 361.49 | _step_zero_grad: 0.64 | _step_check_overflow: 0.77 samples/sec: 16.109 | iteration 15190/ 143000 | elapsed time per iteration (ms): 63567.7 | learning rate: 5.864E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.411870E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-04 23:57:59,291] [INFO] [logging.py:60:log_dist] [Rank 0] step=15200, skipped=14, lr=[0.000586404492847996, 0.000586404492847996], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15200 loss: 2.4385 iter time (s): 63.649 samples/sec: 16.088 %comms: 0.002908525817575662 %optimizer_step 0.0579995191416116 %forward: 22.906848981102467 %backward: 61.37922086001204 [2025-04-04 23:57:59,291] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28346.31 | forward: 145799.89 | backward_microstep: 390689.76 | backward: 390672.84 | backward_inner_microstep: 390653.34 | backward_inner: 390646.20 | backward_allreduce_microstep: 10.05 | backward_allreduce: 4.63 | reduce_tied_grads: 0.35 | comms: 18.51 | reduce_grads: 0.22 | step: 369.16 | _step_clipping: 0.16 | _step_step: 367.19 | _step_zero_grad: 0.58 | _step_check_overflow: 0.55 samples/sec: 16.088 | iteration 15200/ 143000 | elapsed time per iteration (ms): 63649.7 | learning rate: 5.864E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.412118E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 00:08:24,786] [INFO] [logging.py:60:log_dist] [Rank 0] step=15210, skipped=14, lr=[0.0005863848699486191, 0.0005863848699486191], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15210 loss: 2.3942 iter time (s): 62.549 samples/sec: 16.371 %comms: 0.002962309033792922 %optimizer_step 0.059050182515225313 %forward: 23.273444655795835 %backward: 62.41995975620311 [2025-04-05 00:08:24,787] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17879.34 | forward: 145573.00 | backward_microstep: 390446.39 | backward: 390430.43 | backward_inner_microstep: 390410.86 | backward_inner: 390403.82 | backward_allreduce_microstep: 8.38 | backward_allreduce: 2.86 | reduce_tied_grads: 0.34 | comms: 18.53 | reduce_grads: 0.22 | step: 369.35 | _step_clipping: 0.13 | _step_step: 367.33 | _step_zero_grad: 0.55 | _step_check_overflow: 0.68 samples/sec: 16.371 | iteration 15210/ 143000 | elapsed time per iteration (ms): 62549.6 | learning rate: 5.864E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.408120E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 00:18:59,461] [INFO] [logging.py:60:log_dist] [Rank 0] step=15220, skipped=14, lr=[0.000586365233227024, 0.000586365233227024], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15220 loss: 2.3990 iter time (s): 63.467 samples/sec: 16.134 %comms: 0.0028924963672391766 %optimizer_step 0.057189181369519636 %forward: 22.92524472440573 %backward: 61.50045381648162 [2025-04-05 00:18:59,461] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27267.71 | forward: 145499.23 | backward_microstep: 390336.75 | backward: 390323.80 | backward_inner_microstep: 390305.91 | backward_inner: 390299.02 | backward_allreduce_microstep: 8.53 | backward_allreduce: 2.97 | reduce_tied_grads: 0.36 | comms: 18.36 | reduce_grads: 0.21 | step: 362.96 | _step_clipping: 0.14 | _step_step: 360.94 | _step_zero_grad: 0.59 | _step_check_overflow: 0.65 samples/sec: 16.134 | iteration 15220/ 143000 | elapsed time per iteration (ms): 63467.4 | learning rate: 5.864E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.406788E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 00:29:31,229] [INFO] [logging.py:60:log_dist] [Rank 0] step=15230, skipped=14, lr=[0.0005863455826841588, 0.0005863455826841588], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15230 loss: 2.4224 iter time (s): 63.176 samples/sec: 16.209 %comms: 0.0028848923245187465 %optimizer_step 0.05712625710275575 %forward: 23.062434947317826 %backward: 61.813824573541055 [2025-04-05 00:29:31,230] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23947.45 | forward: 145699.83 | backward_microstep: 390531.26 | backward: 390516.61 | backward_inner_microstep: 390498.58 | backward_inner: 390489.87 | backward_allreduce_microstep: 8.48 | backward_allreduce: 2.92 | reduce_tied_grads: 0.34 | comms: 18.23 | reduce_grads: 0.22 | step: 360.90 | _step_clipping: 0.12 | _step_step: 358.95 | _step_zero_grad: 0.55 | _step_check_overflow: 0.67 samples/sec: 16.208 | iteration 15230/ 143000 | elapsed time per iteration (ms): 63176.9 | learning rate: 5.863E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.412783E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 00:40:04,962] [INFO] [logging.py:60:log_dist] [Rank 0] step=15240, skipped=14, lr=[0.0005863259183209717, 0.0005863259183209717], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15240 loss: 2.4273 iter time (s): 63.373 samples/sec: 16.158 %comms: 0.0028701972101852417 %optimizer_step 0.057698785476413564 %forward: 22.978848299658488 %backward: 61.57755597593635 [2025-04-05 00:40:04,962] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26295.75 | forward: 145622.98 | backward_microstep: 390244.22 | backward: 390233.11 | backward_inner_microstep: 390215.19 | backward_inner: 390208.34 | backward_allreduce_microstep: 8.49 | backward_allreduce: 2.87 | reduce_tied_grads: 0.37 | comms: 18.19 | reduce_grads: 0.22 | step: 365.65 | _step_clipping: 0.14 | _step_step: 359.75 | _step_zero_grad: 4.57 | _step_check_overflow: 0.56 samples/sec: 16.158 | iteration 15240/ 143000 | elapsed time per iteration (ms): 63373.2 | learning rate: 5.863E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.410044E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 00:50:33,648] [INFO] [logging.py:60:log_dist] [Rank 0] step=15250, skipped=14, lr=[0.0005863062401384117, 0.0005863062401384117], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15250 loss: 2.4050 iter time (s): 62.868 samples/sec: 16.288 %comms: 0.0028589482459385838 %optimizer_step 0.05849483882591186 %forward: 23.18044604942143 %backward: 62.075141843654706 [2025-04-05 00:50:33,649] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21110.06 | forward: 145731.04 | backward_microstep: 390265.50 | backward: 390254.56 | backward_inner_microstep: 390237.00 | backward_inner: 390230.38 | backward_allreduce_microstep: 8.45 | backward_allreduce: 3.02 | reduce_tied_grads: 0.29 | comms: 17.97 | reduce_grads: 0.19 | step: 367.75 | _step_clipping: 0.14 | _step_step: 365.93 | _step_zero_grad: 0.49 | _step_check_overflow: 0.62 samples/sec: 16.288 | iteration 15250/ 143000 | elapsed time per iteration (ms): 62868.7 | learning rate: 5.863E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.410939E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 01:01:01,925] [INFO] [logging.py:60:log_dist] [Rank 0] step=15260, skipped=14, lr=[0.000586286548137429, 0.000586286548137429], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15260 loss: 2.4106 iter time (s): 62.827 samples/sec: 16.299 %comms: 0.002879559382885815 %optimizer_step 0.055517879097154804 %forward: 23.14705316407866 %backward: 62.108958508317244 [2025-04-05 01:01:01,926] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21072.13 | forward: 145426.25 | backward_microstep: 390222.22 | backward: 390212.65 | backward_inner_microstep: 390195.55 | backward_inner: 390188.99 | backward_allreduce_microstep: 8.26 | backward_allreduce: 2.87 | reduce_tied_grads: 0.29 | comms: 18.09 | reduce_grads: 0.20 | step: 348.80 | _step_clipping: 0.13 | _step_step: 347.06 | _step_zero_grad: 0.49 | _step_check_overflow: 0.54 samples/sec: 16.299 | iteration 15260/ 143000 | elapsed time per iteration (ms): 62827.7 | learning rate: 5.863E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.413748E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 01:11:33,381] [INFO] [logging.py:60:log_dist] [Rank 0] step=15270, skipped=14, lr=[0.0005862668423189736, 0.0005862668423189736], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15270 loss: 2.3892 iter time (s): 63.145 samples/sec: 16.217 %comms: 0.0028829977032242896 %optimizer_step 0.05624167596287997 %forward: 23.060447994003113 %backward: 61.83190909492458 [2025-04-05 01:11:33,382] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23779.13 | forward: 145615.20 | backward_microstep: 390451.33 | backward: 390437.59 | backward_inner_microstep: 390420.11 | backward_inner: 390413.38 | backward_allreduce_microstep: 8.23 | backward_allreduce: 2.81 | reduce_tied_grads: 0.33 | comms: 18.20 | reduce_grads: 0.21 | step: 355.14 | _step_clipping: 0.12 | _step_step: 353.31 | _step_zero_grad: 0.52 | _step_check_overflow: 0.54 samples/sec: 16.216 | iteration 15270/ 143000 | elapsed time per iteration (ms): 63145.6 | learning rate: 5.863E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.401048E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 01:22:07,004] [INFO] [logging.py:60:log_dist] [Rank 0] step=15280, skipped=14, lr=[0.0005862471226839966, 0.0005862471226839966], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15280 loss: 2.4149 iter time (s): 63.362 samples/sec: 16.161 %comms: 0.0029031656852155606 %optimizer_step 0.058039457429271336 %forward: 22.997638031975598 %backward: 61.632546362870976 [2025-04-05 01:22:07,005] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25734.89 | forward: 145716.91 | backward_microstep: 390530.91 | backward: 390514.20 | backward_inner_microstep: 390495.50 | backward_inner: 390488.37 | backward_allreduce_microstep: 8.76 | backward_allreduce: 2.97 | reduce_tied_grads: 0.39 | comms: 18.39 | reduce_grads: 0.21 | step: 367.75 | _step_clipping: 0.15 | _step_step: 365.48 | _step_zero_grad: 0.61 | _step_check_overflow: 0.80 samples/sec: 16.161 | iteration 15280/ 143000 | elapsed time per iteration (ms): 63362.3 | learning rate: 5.862E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.415282E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 01:32:36,784] [INFO] [logging.py:60:log_dist] [Rank 0] step=15290, skipped=14, lr=[0.0005862273892334501, 0.0005862273892334501], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15290 loss: 2.4053 iter time (s): 62.977 samples/sec: 16.260 %comms: 0.002917777161491324 %optimizer_step 0.05740868153444995 %forward: 23.133271632017777 %backward: 62.02891057270358 [2025-04-05 01:32:36,784] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21818.41 | forward: 145687.29 | backward_microstep: 390658.67 | backward: 390641.84 | backward_inner_microstep: 390623.99 | backward_inner: 390616.95 | backward_allreduce_microstep: 8.37 | backward_allreduce: 2.89 | reduce_tied_grads: 0.34 | comms: 18.38 | reduce_grads: 0.21 | step: 361.54 | _step_clipping: 0.14 | _step_step: 359.66 | _step_zero_grad: 0.51 | _step_check_overflow: 0.62 samples/sec: 16.260 | iteration 15290/ 143000 | elapsed time per iteration (ms): 62978.0 | learning rate: 5.862E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.415342E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 01:43:05,649] [INFO] [logging.py:60:log_dist] [Rank 0] step=15300, skipped=14, lr=[0.0005862076419682862, 0.0005862076419682862], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15300 loss: 2.4003 iter time (s): 62.886 samples/sec: 16.283 %comms: 0.0029383227515068435 %optimizer_step 0.058700865776180756 %forward: 23.187010807572822 %backward: 62.126212092635704 [2025-04-05 01:43:05,650] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20665.34 | forward: 145813.68 | backward_microstep: 390705.37 | backward: 390686.48 | backward_inner_microstep: 390668.54 | backward_inner: 390661.11 | backward_allreduce_microstep: 8.28 | backward_allreduce: 2.87 | reduce_tied_grads: 0.37 | comms: 18.48 | reduce_grads: 0.23 | step: 369.15 | _step_clipping: 0.15 | _step_step: 366.94 | _step_zero_grad: 0.54 | _step_check_overflow: 0.81 samples/sec: 16.283 | iteration 15300/ 143000 | elapsed time per iteration (ms): 62886.6 | learning rate: 5.862E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.410439E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 01:53:38,749] [INFO] [logging.py:60:log_dist] [Rank 0] step=15310, skipped=14, lr=[0.0005861878808894581, 0.0005861878808894581], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15310 loss: 2.4385 iter time (s): 63.309 samples/sec: 16.175 %comms: 0.0028877745721726344 %optimizer_step 0.05671367670753918 %forward: 23.048051924525467 %backward: 61.710017873694326 [2025-04-05 01:53:38,749] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24811.77 | forward: 145914.62 | backward_microstep: 390696.00 | backward: 390679.16 | backward_inner_microstep: 390660.89 | backward_inner: 390653.72 | backward_allreduce_microstep: 8.46 | backward_allreduce: 2.94 | reduce_tied_grads: 0.34 | comms: 18.28 | reduce_grads: 0.21 | step: 359.05 | _step_clipping: 0.14 | _step_step: 357.11 | _step_zero_grad: 0.56 | _step_check_overflow: 0.60 samples/sec: 16.174 | iteration 15310/ 143000 | elapsed time per iteration (ms): 63309.9 | learning rate: 5.862E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.419721E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 02:04:06,816] [INFO] [logging.py:60:log_dist] [Rank 0] step=15320, skipped=14, lr=[0.0005861681059979196, 0.0005861681059979196], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15320 loss: 2.4210 iter time (s): 62.806 samples/sec: 16.304 %comms: 0.002978955065006994 %optimizer_step 0.05959743648499598 %forward: 23.30232916716037 %backward: 62.23624103461799 [2025-04-05 02:04:06,817] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19068.31 | forward: 146352.88 | backward_microstep: 390899.51 | backward: 390881.65 | backward_inner_microstep: 390861.96 | backward_inner: 390854.14 | backward_allreduce_microstep: 9.32 | backward_allreduce: 3.20 | reduce_tied_grads: 0.36 | comms: 18.71 | reduce_grads: 0.21 | step: 374.31 | _step_clipping: 0.13 | _step_step: 372.10 | _step_zero_grad: 0.60 | _step_check_overflow: 0.77 samples/sec: 16.304 | iteration 15320/ 143000 | elapsed time per iteration (ms): 62806.8 | learning rate: 5.862E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.415560E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 02:14:36,512] [INFO] [logging.py:60:log_dist] [Rank 0] step=15330, skipped=14, lr=[0.000586148317294625, 0.000586148317294625], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15330 loss: 2.4161 iter time (s): 62.969 samples/sec: 16.262 %comms: 0.002878677236326095 %optimizer_step 0.05628741826999183 %forward: 23.16609794317317 %backward: 62.034962885793576 [2025-04-05 02:14:36,513] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21538.99 | forward: 145874.47 | backward_microstep: 390643.25 | backward: 390627.60 | backward_inner_microstep: 390609.76 | backward_inner: 390602.59 | backward_allreduce_microstep: 8.29 | backward_allreduce: 2.84 | reduce_tied_grads: 0.31 | comms: 18.13 | reduce_grads: 0.20 | step: 354.44 | _step_clipping: 0.12 | _step_step: 352.69 | _step_zero_grad: 0.51 | _step_check_overflow: 0.51 samples/sec: 16.262 | iteration 15330/ 143000 | elapsed time per iteration (ms): 62969.5 | learning rate: 5.861E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.413337E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 02:24:58,213] [INFO] [logging.py:60:log_dist] [Rank 0] step=15340, skipped=14, lr=[0.0005861285147805296, 0.0005861285147805296], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15340 loss: 2.4176 iter time (s): 62.169 samples/sec: 16.471 %comms: 0.0029126274214671235 %optimizer_step 0.05607746397183347 %forward: 23.374510818742465 %backward: 62.74829111406465 [2025-04-05 02:24:58,213] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14697.92 | forward: 145318.11 | backward_microstep: 390112.98 | backward: 390102.85 | backward_inner_microstep: 390086.85 | backward_inner: 390080.56 | backward_allreduce_microstep: 7.61 | backward_allreduce: 2.63 | reduce_tied_grads: 0.32 | comms: 18.11 | reduce_grads: 0.18 | step: 348.63 | _step_clipping: 0.12 | _step_step: 346.87 | _step_zero_grad: 0.50 | _step_check_overflow: 0.55 samples/sec: 16.471 | iteration 15340/ 143000 | elapsed time per iteration (ms): 62170.1 | learning rate: 5.861E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.410384E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 02:35:23,074] [INFO] [logging.py:60:log_dist] [Rank 0] step=15350, skipped=14, lr=[0.000586108698456589, 0.000586108698456589], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15350 loss: 2.4121 iter time (s): 62.486 samples/sec: 16.388 %comms: 0.0029175063631003413 %optimizer_step 0.05715585095627673 %forward: 23.26732451434838 %backward: 62.4413653640711 [2025-04-05 02:35:23,075] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17686.92 | forward: 145387.16 | backward_microstep: 390179.46 | backward: 390168.32 | backward_inner_microstep: 390151.15 | backward_inner: 390144.55 | backward_allreduce_microstep: 8.18 | backward_allreduce: 2.81 | reduce_tied_grads: 0.30 | comms: 18.23 | reduce_grads: 0.21 | step: 357.14 | _step_clipping: 0.12 | _step_step: 355.29 | _step_zero_grad: 0.56 | _step_check_overflow: 0.55 samples/sec: 16.388 | iteration 15350/ 143000 | elapsed time per iteration (ms): 62486.2 | learning rate: 5.861E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.413567E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 02:45:51,050] [INFO] [logging.py:60:log_dist] [Rank 0] step=15360, skipped=14, lr=[0.0005860888683237596, 0.0005860888683237596], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15360 loss: 2.3961 iter time (s): 62.797 samples/sec: 16.307 %comms: 0.002921946556944828 %optimizer_step 0.05914636256774432 %forward: 23.20713334747265 %backward: 62.15122974685002 [2025-04-05 02:45:51,050] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20299.83 | forward: 145733.71 | backward_microstep: 390304.65 | backward: 390290.75 | backward_inner_microstep: 390273.51 | backward_inner: 390266.81 | backward_allreduce_microstep: 8.17 | backward_allreduce: 2.81 | reduce_tied_grads: 0.33 | comms: 18.35 | reduce_grads: 0.22 | step: 371.42 | _step_clipping: 0.11 | _step_step: 369.64 | _step_zero_grad: 0.52 | _step_check_overflow: 0.54 samples/sec: 16.306 | iteration 15360/ 143000 | elapsed time per iteration (ms): 62797.5 | learning rate: 5.861E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.410239E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 02:56:13,342] [INFO] [logging.py:60:log_dist] [Rank 0] step=15370, skipped=14, lr=[0.0005860690243829986, 0.0005860690243829986], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15370 loss: 2.3889 iter time (s): 62.229 samples/sec: 16.455 %comms: 0.0029358735906653533 %optimizer_step 0.0563707039324354 %forward: 23.34345692747626 %backward: 62.70152775309336 [2025-04-05 02:56:13,342] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15233.15 | forward: 145263.13 | backward_microstep: 390192.72 | backward: 390183.01 | backward_inner_microstep: 390167.05 | backward_inner: 390160.82 | backward_allreduce_microstep: 7.63 | backward_allreduce: 2.62 | reduce_tied_grads: 0.29 | comms: 18.27 | reduce_grads: 0.19 | step: 350.79 | _step_clipping: 0.11 | _step_step: 349.11 | _step_zero_grad: 0.49 | _step_check_overflow: 0.47 samples/sec: 16.455 | iteration 15370/ 143000 | elapsed time per iteration (ms): 62229.2 | learning rate: 5.861E-04 | approx flops per GPU: 71.0TFLOPS | lm_loss: 2.400533E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 03:06:36,667] [INFO] [logging.py:60:log_dist] [Rank 0] step=15380, skipped=14, lr=[0.0005860491666352638, 0.0005860491666352638], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15380 loss: 2.4309 iter time (s): 62.332 samples/sec: 16.428 %comms: 0.0029082072818650785 %optimizer_step 0.056460921178466204 %forward: 23.318898927111036 %backward: 62.60422842276947 [2025-04-05 03:06:36,668] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16138.98 | forward: 145351.39 | backward_microstep: 390235.89 | backward: 390224.76 | backward_inner_microstep: 390208.03 | backward_inner: 390201.51 | backward_allreduce_microstep: 8.04 | backward_allreduce: 2.90 | reduce_tied_grads: 0.29 | comms: 18.13 | reduce_grads: 0.19 | step: 351.93 | _step_clipping: 0.10 | _step_step: 350.24 | _step_zero_grad: 0.50 | _step_check_overflow: 0.53 samples/sec: 16.428 | iteration 15380/ 143000 | elapsed time per iteration (ms): 62332.6 | learning rate: 5.860E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.409557E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 03:17:07,378] [INFO] [logging.py:60:log_dist] [Rank 0] step=15390, skipped=14, lr=[0.0005860292950815134, 0.0005860292950815134], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15390 loss: 2.4062 iter time (s): 63.070 samples/sec: 16.236 %comms: 0.0028762728929848896 %optimizer_step 0.05621671740354137 %forward: 23.049049609207742 %backward: 61.8510277639998 [2025-04-05 03:17:07,378] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23677.77 | forward: 145371.48 | backward_microstep: 390106.33 | backward: 390097.44 | backward_inner_microstep: 390080.70 | backward_inner: 390074.35 | backward_allreduce_microstep: 8.00 | backward_allreduce: 2.78 | reduce_tied_grads: 0.30 | comms: 18.14 | reduce_grads: 0.19 | step: 354.56 | _step_clipping: 0.11 | _step_step: 352.85 | _step_zero_grad: 0.48 | _step_check_overflow: 0.53 samples/sec: 16.236 | iteration 15390/ 143000 | elapsed time per iteration (ms): 63071.1 | learning rate: 5.860E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.406810E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 03:27:28,564] [INFO] [logging.py:60:log_dist] [Rank 0] step=15400, skipped=14, lr=[0.0005860094097227067, 0.0005860094097227067], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15400 loss: 2.4070 iter time (s): 62.118 samples/sec: 16.485 %comms: 0.002904905455252893 %optimizer_step 0.05710959301864657 %forward: 23.375114074631632 %backward: 62.787548549127195 [2025-04-05 03:27:28,565] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14382.51 | forward: 145201.68 | backward_microstep: 390031.93 | backward: 390024.09 | backward_inner_microstep: 390006.06 | backward_inner: 389999.84 | backward_allreduce_microstep: 7.92 | backward_allreduce: 2.75 | reduce_tied_grads: 0.30 | comms: 18.04 | reduce_grads: 0.19 | step: 354.75 | _step_clipping: 0.12 | _step_step: 352.91 | _step_zero_grad: 0.51 | _step_check_overflow: 0.64 samples/sec: 16.485 | iteration 15400/ 143000 | elapsed time per iteration (ms): 62118.7 | learning rate: 5.860E-04 | approx flops per GPU: 71.1TFLOPS | lm_loss: 2.414927E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 03:37:56,548] [INFO] [logging.py:60:log_dist] [Rank 0] step=15410, skipped=14, lr=[0.0005859895105598033, 0.0005859895105598033], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15410 loss: 2.4015 iter time (s): 62.798 samples/sec: 16.306 %comms: 0.0028821181376378654 %optimizer_step 0.057584350600499976 %forward: 23.16263466218436 %backward: 62.12038312578462 [2025-04-05 03:37:56,549] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20847.23 | forward: 145456.26 | backward_microstep: 390111.38 | backward: 390102.38 | backward_inner_microstep: 390085.76 | backward_inner: 390079.38 | backward_allreduce_microstep: 7.97 | backward_allreduce: 2.71 | reduce_tied_grads: 0.33 | comms: 18.10 | reduce_grads: 0.27 | step: 361.62 | _step_clipping: 0.12 | _step_step: 359.85 | _step_zero_grad: 0.54 | _step_check_overflow: 0.51 samples/sec: 16.306 | iteration 15410/ 143000 | elapsed time per iteration (ms): 62798.4 | learning rate: 5.860E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.417269E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 03:48:26,079] [INFO] [logging.py:60:log_dist] [Rank 0] step=15420, skipped=14, lr=[0.0005859695975937638, 0.0005859695975937638], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15420 loss: 2.4251 iter time (s): 62.952 samples/sec: 16.266 %comms: 0.0028820446533200294 %optimizer_step 0.057889717389934876 %forward: 23.131047547126084 %backward: 62.025578757944 [2025-04-05 03:48:26,080] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21787.49 | forward: 145615.61 | backward_microstep: 390477.44 | backward: 390466.21 | backward_inner_microstep: 390448.47 | backward_inner: 390441.80 | backward_allreduce_microstep: 8.45 | backward_allreduce: 2.92 | reduce_tied_grads: 0.33 | comms: 18.14 | reduce_grads: 0.20 | step: 364.43 | _step_clipping: 0.13 | _step_step: 362.48 | _step_zero_grad: 0.61 | _step_check_overflow: 0.60 samples/sec: 16.266 | iteration 15420/ 143000 | elapsed time per iteration (ms): 62953.1 | learning rate: 5.860E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.412007E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 03:58:56,848] [INFO] [logging.py:60:log_dist] [Rank 0] step=15430, skipped=14, lr=[0.0005859496708255492, 0.0005859496708255492], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15430 loss: 2.4138 iter time (s): 63.076 samples/sec: 16.234 %comms: 0.00284202382368228 %optimizer_step 0.056572145639024574 %forward: 23.065310850491805 %backward: 61.89044622551458 [2025-04-05 03:58:56,849] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23245.07 | forward: 145487.61 | backward_microstep: 390392.06 | backward: 390382.47 | backward_inner_microstep: 390364.31 | backward_inner: 390357.67 | backward_allreduce_microstep: 8.75 | backward_allreduce: 2.95 | reduce_tied_grads: 0.29 | comms: 17.93 | reduce_grads: 0.20 | step: 356.84 | _step_clipping: 0.13 | _step_step: 355.10 | _step_zero_grad: 0.49 | _step_check_overflow: 0.54 samples/sec: 16.234 | iteration 15430/ 143000 | elapsed time per iteration (ms): 63076.9 | learning rate: 5.859E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.417206E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 04:09:25,963] [INFO] [logging.py:60:log_dist] [Rank 0] step=15440, skipped=14, lr=[0.0005859297302561211, 0.0005859297302561211], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15440 loss: 2.4083 iter time (s): 62.910 samples/sec: 16.277 %comms: 0.0028393392665609814 %optimizer_step 0.054955654053290816 %forward: 23.1006937304724 %backward: 62.050183171945925 [2025-04-05 04:09:25,964] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21792.14 | forward: 145326.76 | backward_microstep: 390366.96 | backward: 390358.50 | backward_inner_microstep: 390341.79 | backward_inner: 390335.45 | backward_allreduce_microstep: 8.07 | backward_allreduce: 2.75 | reduce_tied_grads: 0.27 | comms: 17.86 | reduce_grads: 0.18 | step: 345.73 | _step_clipping: 0.12 | _step_step: 344.03 | _step_zero_grad: 0.49 | _step_check_overflow: 0.52 samples/sec: 16.277 | iteration 15440/ 143000 | elapsed time per iteration (ms): 62911.5 | learning rate: 5.859E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.403258E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 04:19:59,135] [INFO] [logging.py:60:log_dist] [Rank 0] step=15450, skipped=14, lr=[0.0005859097758864422, 0.0005859097758864422], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15450 loss: 2.4098 iter time (s): 63.317 samples/sec: 16.173 %comms: 0.0028506346110078436 %optimizer_step 0.05716131653075668 %forward: 23.04251256482988 %backward: 61.66672766401168 [2025-04-05 04:19:59,136] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25168.86 | forward: 145897.28 | backward_microstep: 390463.52 | backward: 390452.56 | backward_inner_microstep: 390435.09 | backward_inner: 390428.49 | backward_allreduce_microstep: 8.45 | backward_allreduce: 3.01 | reduce_tied_grads: 0.33 | comms: 18.05 | reduce_grads: 0.23 | step: 361.93 | _step_clipping: 0.15 | _step_step: 360.12 | _step_zero_grad: 0.49 | _step_check_overflow: 0.57 samples/sec: 16.173 | iteration 15450/ 143000 | elapsed time per iteration (ms): 63317.2 | learning rate: 5.859E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.408798E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 04:30:34,611] [INFO] [logging.py:60:log_dist] [Rank 0] step=15460, skipped=14, lr=[0.0005858898077174755, 0.0005858898077174755], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15460 loss: 2.4204 iter time (s): 63.547 samples/sec: 16.114 %comms: 0.0029178497339367774 %optimizer_step 0.057266520047872214 %forward: 22.95434585819128 %backward: 61.48438726388257 [2025-04-05 04:30:34,612] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27143.06 | forward: 145867.92 | backward_microstep: 390728.17 | backward: 390714.66 | backward_inner_microstep: 390696.50 | backward_inner: 390689.39 | backward_allreduce_microstep: 8.66 | backward_allreduce: 2.97 | reduce_tied_grads: 0.33 | comms: 18.54 | reduce_grads: 0.20 | step: 363.91 | _step_clipping: 0.13 | _step_step: 361.70 | _step_zero_grad: 0.75 | _step_check_overflow: 0.68 samples/sec: 16.114 | iteration 15460/ 143000 | elapsed time per iteration (ms): 63547.6 | learning rate: 5.859E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.409532E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 04:41:04,458] [INFO] [logging.py:60:log_dist] [Rank 0] step=15470, skipped=14, lr=[0.0005858698257501846, 0.0005858698257501846], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15470 loss: 2.4058 iter time (s): 62.984 samples/sec: 16.258 %comms: 0.0029023244934603025 %optimizer_step 0.05733135638722499 %forward: 23.11770269629928 %backward: 62.004817134895006 [2025-04-05 04:41:04,459] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21997.12 | forward: 145604.77 | backward_microstep: 390545.29 | backward: 390531.75 | backward_inner_microstep: 390514.23 | backward_inner: 390507.48 | backward_allreduce_microstep: 8.23 | backward_allreduce: 2.84 | reduce_tied_grads: 0.33 | comms: 18.28 | reduce_grads: 0.21 | step: 361.10 | _step_clipping: 0.14 | _step_step: 359.20 | _step_zero_grad: 0.51 | _step_check_overflow: 0.62 samples/sec: 16.258 | iteration 15470/ 143000 | elapsed time per iteration (ms): 62984.7 | learning rate: 5.859E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.406402E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 04:51:37,901] [INFO] [logging.py:60:log_dist] [Rank 0] step=15480, skipped=14, lr=[0.000585849829985534, 0.000585849829985534], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15480 loss: 2.4012 iter time (s): 63.344 samples/sec: 16.166 %comms: 0.0028438048331058396 %optimizer_step 0.0570142689543851 %forward: 23.031642841545416 %backward: 61.66819529159023 [2025-04-05 04:51:37,902] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25210.84 | forward: 145890.98 | backward_microstep: 390644.27 | backward: 390629.25 | backward_inner_microstep: 390611.49 | backward_inner: 390604.61 | backward_allreduce_microstep: 8.28 | backward_allreduce: 2.84 | reduce_tied_grads: 0.33 | comms: 18.01 | reduce_grads: 0.23 | step: 361.15 | _step_clipping: 0.13 | _step_step: 359.33 | _step_zero_grad: 0.50 | _step_check_overflow: 0.60 samples/sec: 16.166 | iteration 15480/ 143000 | elapsed time per iteration (ms): 63344.3 | learning rate: 5.858E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.407894E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 05:02:13,975] [INFO] [logging.py:60:log_dist] [Rank 0] step=15490, skipped=14, lr=[0.0005858298204244889, 0.0005858298204244889], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15490 loss: 2.4080 iter time (s): 63.607 samples/sec: 16.099 %comms: 0.0029040871695590674 %optimizer_step 0.057992008570732376 %forward: 22.956643586963306 %backward: 61.40758245513936 [2025-04-05 05:02:13,975] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27750.22 | forward: 146019.76 | backward_microstep: 390609.34 | backward: 390593.71 | backward_inner_microstep: 390573.97 | backward_inner: 390567.01 | backward_allreduce_microstep: 8.44 | backward_allreduce: 2.89 | reduce_tied_grads: 0.53 | comms: 18.47 | reduce_grads: 0.22 | step: 368.87 | _step_clipping: 0.14 | _step_step: 366.76 | _step_zero_grad: 0.54 | _step_check_overflow: 0.78 samples/sec: 16.099 | iteration 15490/ 143000 | elapsed time per iteration (ms): 63607.4 | learning rate: 5.858E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.412763E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 05:12:57,433] [INFO] [logging.py:60:log_dist] [Rank 0] step=15500, skipped=14, lr=[0.0005858097970680151, 0.0005858097970680151], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15500 loss: 2.4176 iter time (s): 64.345 samples/sec: 15.914 %comms: 0.0028009886233741955 %optimizer_step 0.0544205996760427 %forward: 22.652588611944218 %backward: 60.68984156128966 [2025-04-05 05:12:57,434] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 35491.77 | forward: 145758.51 | backward_microstep: 390520.99 | backward: 390509.93 | backward_inner_microstep: 390492.81 | backward_inner: 390486.32 | backward_allreduce_microstep: 8.23 | backward_allreduce: 2.75 | reduce_tied_grads: 0.31 | comms: 18.02 | reduce_grads: 0.21 | step: 350.17 | _step_clipping: 0.13 | _step_step: 348.43 | _step_zero_grad: 0.48 | _step_check_overflow: 0.55 samples/sec: 15.914 | iteration 15500/ 143000 | elapsed time per iteration (ms): 64345.8 | learning rate: 5.858E-04 | approx flops per GPU: 68.7TFLOPS | lm_loss: 2.413938E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 05:23:35,236] [INFO] [logging.py:60:log_dist] [Rank 0] step=15510, skipped=14, lr=[0.0005857897599170786, 0.0005857897599170786], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15510 loss: 2.3795 iter time (s): 63.780 samples/sec: 16.055 %comms: 0.0028734831659940558 %optimizer_step 0.05870609581150217 %forward: 22.84251931674564 %backward: 61.23634489167243 [2025-04-05 05:23:35,237] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29820.45 | forward: 145688.97 | backward_microstep: 390578.65 | backward: 390563.75 | backward_inner_microstep: 390544.63 | backward_inner: 390534.40 | backward_allreduce_microstep: 9.95 | backward_allreduce: 2.86 | reduce_tied_grads: 0.35 | comms: 18.33 | reduce_grads: 0.21 | step: 374.43 | _step_clipping: 0.14 | _step_step: 372.41 | _step_zero_grad: 0.56 | _step_check_overflow: 0.67 samples/sec: 16.055 | iteration 15510/ 143000 | elapsed time per iteration (ms): 63780.4 | learning rate: 5.858E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.406332E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 05:34:14,847] [INFO] [logging.py:60:log_dist] [Rank 0] step=15520, skipped=14, lr=[0.0005857697089726468, 0.0005857697089726468], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15520 loss: 2.4025 iter time (s): 63.960 samples/sec: 16.010 %comms: 0.0028132546772757718 %optimizer_step 0.05603979758720754 %forward: 22.758271489403292 %backward: 61.035154330323984 [2025-04-05 05:34:14,847] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31965.87 | forward: 145562.81 | backward_microstep: 390395.02 | backward: 390383.27 | backward_inner_microstep: 390366.35 | backward_inner: 390357.97 | backward_allreduce_microstep: 8.13 | backward_allreduce: 2.78 | reduce_tied_grads: 0.31 | comms: 17.99 | reduce_grads: 0.21 | step: 358.43 | _step_clipping: 0.12 | _step_step: 356.55 | _step_zero_grad: 0.52 | _step_check_overflow: 0.65 samples/sec: 16.010 | iteration 15520/ 143000 | elapsed time per iteration (ms): 63961.0 | learning rate: 5.858E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.409042E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 05:44:44,087] [INFO] [logging.py:60:log_dist] [Rank 0] step=15530, skipped=14, lr=[0.0005857496442356877, 0.0005857496442356877], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15530 loss: 2.4129 iter time (s): 62.923 samples/sec: 16.274 %comms: 0.00289064835940792 %optimizer_step 0.05759585596651195 %forward: 23.1460768774831 %backward: 62.0471098280349 [2025-04-05 05:44:44,088] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21459.95 | forward: 145643.07 | backward_microstep: 390430.29 | backward: 390421.75 | backward_inner_microstep: 390405.01 | backward_inner: 390398.62 | backward_allreduce_microstep: 8.14 | backward_allreduce: 2.94 | reduce_tied_grads: 0.34 | comms: 18.19 | reduce_grads: 0.20 | step: 362.41 | _step_clipping: 0.13 | _step_step: 360.62 | _step_zero_grad: 0.51 | _step_check_overflow: 0.55 samples/sec: 16.274 | iteration 15530/ 143000 | elapsed time per iteration (ms): 62924.0 | learning rate: 5.857E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.408871E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 05:55:12,577] [INFO] [logging.py:60:log_dist] [Rank 0] step=15540, skipped=14, lr=[0.0005857295657071692, 0.0005857295657071692], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15540 loss: 2.4097 iter time (s): 62.848 samples/sec: 16.293 %comms: 0.0028771765179902353 %optimizer_step 0.05698379274677077 %forward: 23.159215361414926 %backward: 62.12831565554109 [2025-04-05 05:55:12,578] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20776.23 | forward: 145552.16 | backward_microstep: 390476.14 | backward: 390467.06 | backward_inner_microstep: 390450.57 | backward_inner: 390444.18 | backward_allreduce_microstep: 7.91 | backward_allreduce: 2.73 | reduce_tied_grads: 0.32 | comms: 18.08 | reduce_grads: 0.20 | step: 358.13 | _step_clipping: 0.14 | _step_step: 356.29 | _step_zero_grad: 0.48 | _step_check_overflow: 0.64 samples/sec: 16.293 | iteration 15540/ 143000 | elapsed time per iteration (ms): 62849.1 | learning rate: 5.857E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.408962E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 06:05:43,702] [INFO] [logging.py:60:log_dist] [Rank 0] step=15550, skipped=14, lr=[0.0005857094733880606, 0.0005857094733880606], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15550 loss: 2.4145 iter time (s): 63.112 samples/sec: 16.225 %comms: 0.002911110090161624 %optimizer_step 0.05870593869272793 %forward: 23.647726039108914 %backward: 61.90535868898637 [2025-04-05 06:05:43,702] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19401.96 | forward: 149245.02 | backward_microstep: 390709.56 | backward: 390695.78 | backward_inner_microstep: 390678.50 | backward_inner: 390665.57 | backward_allreduce_microstep: 8.02 | backward_allreduce: 2.78 | reduce_tied_grads: 0.35 | comms: 18.37 | reduce_grads: 0.20 | step: 370.50 | _step_clipping: 0.12 | _step_step: 368.47 | _step_zero_grad: 0.59 | _step_check_overflow: 0.66 samples/sec: 16.225 | iteration 15550/ 143000 | elapsed time per iteration (ms): 63112.4 | learning rate: 5.857E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.414111E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 06:16:18,143] [INFO] [logging.py:60:log_dist] [Rank 0] step=15560, skipped=14, lr=[0.0005856893672793318, 0.0005856893672793318], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15560 loss: 2.4291 iter time (s): 63.443 samples/sec: 16.140 %comms: 0.0028857056909633682 %optimizer_step 0.05850700738961776 %forward: 22.974925845492226 %backward: 61.587583727272424 [2025-04-05 06:16:18,144] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26178.90 | forward: 145760.95 | backward_microstep: 390745.35 | backward: 390733.13 | backward_inner_microstep: 390716.22 | backward_inner: 390709.64 | backward_allreduce_microstep: 7.94 | backward_allreduce: 2.75 | reduce_tied_grads: 0.38 | comms: 18.31 | reduce_grads: 0.23 | step: 371.19 | _step_clipping: 0.16 | _step_step: 369.19 | _step_zero_grad: 0.55 | _step_check_overflow: 0.62 samples/sec: 16.140 | iteration 15560/ 143000 | elapsed time per iteration (ms): 63444.1 | learning rate: 5.857E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.412387E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 06:26:50,846] [INFO] [logging.py:60:log_dist] [Rank 0] step=15570, skipped=14, lr=[0.000585669247381953, 0.000585669247381953], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15570 loss: 2.4029 iter time (s): 63.270 samples/sec: 16.185 %comms: 0.002885265256358465 %optimizer_step 0.05835819874754971 %forward: 23.01166587577582 %backward: 61.74226082489521 [2025-04-05 06:26:50,847] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24713.83 | forward: 145594.19 | backward_microstep: 390654.71 | backward: 390641.62 | backward_inner_microstep: 390622.44 | backward_inner: 390615.72 | backward_allreduce_microstep: 8.33 | backward_allreduce: 2.88 | reduce_tied_grads: 0.35 | comms: 18.25 | reduce_grads: 0.23 | step: 369.23 | _step_clipping: 0.14 | _step_step: 367.32 | _step_zero_grad: 0.58 | _step_check_overflow: 0.51 samples/sec: 16.185 | iteration 15570/ 143000 | elapsed time per iteration (ms): 63270.4 | learning rate: 5.857E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.406171E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 06:37:20,069] [INFO] [logging.py:60:log_dist] [Rank 0] step=15580, skipped=14, lr=[0.0005856491136968953, 0.0005856491136968953], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15580 loss: 2.3978 iter time (s): 62.922 samples/sec: 16.274 %comms: 0.002888379118679218 %optimizer_step 0.0567520951460485 %forward: 23.13346733237988 %backward: 62.06692317277878 [2025-04-05 06:37:20,070] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21391.48 | forward: 145559.70 | backward_microstep: 390547.95 | backward: 390535.61 | backward_inner_microstep: 390518.35 | backward_inner: 390511.76 | backward_allreduce_microstep: 8.25 | backward_allreduce: 2.80 | reduce_tied_grads: 0.31 | comms: 18.17 | reduce_grads: 0.20 | step: 357.09 | _step_clipping: 0.14 | _step_step: 355.34 | _step_zero_grad: 0.53 | _step_check_overflow: 0.49 samples/sec: 16.274 | iteration 15580/ 143000 | elapsed time per iteration (ms): 62922.3 | learning rate: 5.856E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.409066E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 06:47:47,628] [INFO] [logging.py:60:log_dist] [Rank 0] step=15590, skipped=14, lr=[0.0005856289662251303, 0.0005856289662251303], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15590 loss: 2.4180 iter time (s): 62.755 samples/sec: 16.317 %comms: 0.0029440584090976176 %optimizer_step 0.0570893622310378 %forward: 23.23440695669012 %backward: 62.26377237975862 [2025-04-05 06:47:47,629] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19280.22 | forward: 145808.26 | backward_microstep: 390751.23 | backward: 390738.29 | backward_inner_microstep: 390720.58 | backward_inner: 390713.63 | backward_allreduce_microstep: 8.42 | backward_allreduce: 3.03 | reduce_tied_grads: 0.33 | comms: 18.48 | reduce_grads: 0.19 | step: 358.27 | _step_clipping: 0.13 | _step_step: 356.40 | _step_zero_grad: 0.54 | _step_check_overflow: 0.60 samples/sec: 16.317 | iteration 15590/ 143000 | elapsed time per iteration (ms): 62755.9 | learning rate: 5.856E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.408644E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 06:58:17,660] [INFO] [logging.py:60:log_dist] [Rank 0] step=15600, skipped=14, lr=[0.0005856088049676308, 0.0005856088049676308], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15600 loss: 2.4208 iter time (s): 63.003 samples/sec: 16.253 %comms: 0.0029158512342100417 %optimizer_step 0.058098256372087355 %forward: 23.12019438538979 %backward: 61.98739450628038 [2025-04-05 06:58:17,661] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22102.67 | forward: 145663.30 | backward_microstep: 390547.73 | backward: 390536.86 | backward_inner_microstep: 390516.46 | backward_inner: 390509.71 | backward_allreduce_microstep: 9.86 | backward_allreduce: 2.88 | reduce_tied_grads: 0.34 | comms: 18.37 | reduce_grads: 0.21 | step: 366.03 | _step_clipping: 0.14 | _step_step: 363.97 | _step_zero_grad: 0.54 | _step_check_overflow: 0.75 samples/sec: 16.253 | iteration 15600/ 143000 | elapsed time per iteration (ms): 63003.2 | learning rate: 5.856E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.419423E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 07:08:44,724] [INFO] [logging.py:60:log_dist] [Rank 0] step=15610, skipped=14, lr=[0.0005855886299253697, 0.0005855886299253697], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15610 loss: 2.4041 iter time (s): 62.706 samples/sec: 16.330 %comms: 0.0029175258294759617 %optimizer_step 0.05572929455096902 %forward: 23.195879428005124 %backward: 62.2296697762043 [2025-04-05 07:08:44,725] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19736.61 | forward: 145451.57 | backward_microstep: 390227.15 | backward: 390215.99 | backward_inner_microstep: 390199.26 | backward_inner: 390192.76 | backward_allreduce_microstep: 8.11 | backward_allreduce: 2.72 | reduce_tied_grads: 0.29 | comms: 18.29 | reduce_grads: 0.19 | step: 349.45 | _step_clipping: 0.12 | _step_step: 347.68 | _step_zero_grad: 0.51 | _step_check_overflow: 0.54 samples/sec: 16.330 | iteration 15610/ 143000 | elapsed time per iteration (ms): 62706.4 | learning rate: 5.856E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.397042E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 07:19:15,692] [INFO] [logging.py:60:log_dist] [Rank 0] step=15620, skipped=14, lr=[0.0005855684410993207, 0.0005855684410993207], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15620 loss: 2.4071 iter time (s): 63.096 samples/sec: 16.229 %comms: 0.0029001949969261797 %optimizer_step 0.05684343651671569 %forward: 23.072299517750157 %backward: 61.862298776309586 [2025-04-05 07:19:15,692] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23373.17 | forward: 145577.24 | backward_microstep: 390336.54 | backward: 390327.05 | backward_inner_microstep: 390310.61 | backward_inner: 390304.23 | backward_allreduce_microstep: 7.89 | backward_allreduce: 2.70 | reduce_tied_grads: 0.32 | comms: 18.30 | reduce_grads: 0.19 | step: 358.66 | _step_clipping: 0.13 | _step_step: 356.85 | _step_zero_grad: 0.50 | _step_check_overflow: 0.57 samples/sec: 16.229 | iteration 15620/ 143000 | elapsed time per iteration (ms): 63096.7 | learning rate: 5.856E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.399248E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 07:29:41,103] [INFO] [logging.py:60:log_dist] [Rank 0] step=15630, skipped=14, lr=[0.000585548238490458, 0.000585548238490458], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15630 loss: 2.4288 iter time (s): 62.541 samples/sec: 16.373 %comms: 0.0029051027551389532 %optimizer_step 0.05590087889279647 %forward: 23.229628598805164 %backward: 62.39322422151656 [2025-04-05 07:29:41,104] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18271.81 | forward: 145279.50 | backward_microstep: 390218.98 | backward: 390210.99 | backward_inner_microstep: 390194.91 | backward_inner: 390188.79 | backward_allreduce_microstep: 7.72 | backward_allreduce: 2.66 | reduce_tied_grads: 0.31 | comms: 18.17 | reduce_grads: 0.20 | step: 349.61 | _step_clipping: 0.12 | _step_step: 347.90 | _step_zero_grad: 0.49 | _step_check_overflow: 0.53 samples/sec: 16.373 | iteration 15630/ 143000 | elapsed time per iteration (ms): 62541.2 | learning rate: 5.855E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.412164E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 07:40:13,398] [INFO] [logging.py:60:log_dist] [Rank 0] step=15640, skipped=14, lr=[0.0005855280220997571, 0.0005855280220997571], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15640 loss: 2.3784 iter time (s): 63.229 samples/sec: 16.195 %comms: 0.0028666552800351735 %optimizer_step 0.056028246530335994 %forward: 23.029779941288876 %backward: 61.74453815348231 [2025-04-05 07:40:13,399] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24569.26 | forward: 145614.67 | backward_microstep: 390413.30 | backward: 390403.67 | backward_inner_microstep: 390386.85 | backward_inner: 390380.44 | backward_allreduce_microstep: 8.08 | backward_allreduce: 2.93 | reduce_tied_grads: 0.31 | comms: 18.13 | reduce_grads: 0.18 | step: 354.26 | _step_clipping: 0.13 | _step_step: 352.61 | _step_zero_grad: 0.50 | _step_check_overflow: 0.47 samples/sec: 16.195 | iteration 15640/ 143000 | elapsed time per iteration (ms): 63229.5 | learning rate: 5.855E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.403657E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 07:50:37,845] [INFO] [logging.py:60:log_dist] [Rank 0] step=15650, skipped=14, lr=[0.0005855077919281935, 0.0005855077919281935], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15650 loss: 2.3912 iter time (s): 62.444 samples/sec: 16.399 %comms: 0.002866407429716451 %optimizer_step 0.05574802403084744 %forward: 23.28388127238559 %backward: 62.48784049571974 [2025-04-05 07:50:37,846] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17168.05 | forward: 145394.21 | backward_microstep: 390210.05 | backward: 390199.98 | backward_inner_microstep: 390183.53 | backward_inner: 390177.19 | backward_allreduce_microstep: 7.76 | backward_allreduce: 2.68 | reduce_tied_grads: 0.29 | comms: 17.90 | reduce_grads: 0.19 | step: 348.11 | _step_clipping: 0.11 | _step_step: 346.46 | _step_zero_grad: 0.46 | _step_check_overflow: 0.52 samples/sec: 16.399 | iteration 15650/ 143000 | elapsed time per iteration (ms): 62444.7 | learning rate: 5.855E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.397205E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 08:01:09,131] [INFO] [logging.py:60:log_dist] [Rank 0] step=15660, skipped=14, lr=[0.0005854875479767434, 0.0005854875479767434], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15660 loss: 2.3988 iter time (s): 63.128 samples/sec: 16.221 %comms: 0.0028515585134647925 %optimizer_step 0.05584919901997604 %forward: 23.09703837419069 %backward: 61.8504733553897 [2025-04-05 08:01:09,132] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23321.92 | forward: 145806.98 | backward_microstep: 390458.27 | backward: 390449.66 | backward_inner_microstep: 390432.27 | backward_inner: 390425.93 | backward_allreduce_microstep: 8.58 | backward_allreduce: 3.12 | reduce_tied_grads: 0.33 | comms: 18.00 | reduce_grads: 0.22 | step: 352.56 | _step_clipping: 0.13 | _step_step: 350.78 | _step_zero_grad: 0.53 | _step_check_overflow: 0.51 samples/sec: 16.221 | iteration 15660/ 143000 | elapsed time per iteration (ms): 63128.6 | learning rate: 5.855E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.405411E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 08:11:39,500] [INFO] [logging.py:60:log_dist] [Rank 0] step=15670, skipped=14, lr=[0.0005854672902463842, 0.0005854672902463842], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15670 loss: 2.3988 iter time (s): 63.036 samples/sec: 16.245 %comms: 0.0029005298222723424 %optimizer_step 0.0576725013578022 %forward: 23.082627828913253 %backward: 61.948910065895014 [2025-04-05 08:11:39,500] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22607.24 | forward: 145504.16 | backward_microstep: 390513.42 | backward: 390502.52 | backward_inner_microstep: 390483.78 | backward_inner: 390476.92 | backward_allreduce_microstep: 8.17 | backward_allreduce: 2.84 | reduce_tied_grads: 0.36 | comms: 18.28 | reduce_grads: 0.23 | step: 363.55 | _step_clipping: 0.14 | _step_step: 361.58 | _step_zero_grad: 0.52 | _step_check_overflow: 0.62 samples/sec: 16.244 | iteration 15670/ 143000 | elapsed time per iteration (ms): 63036.9 | learning rate: 5.855E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.396189E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 08:22:13,271] [INFO] [logging.py:60:log_dist] [Rank 0] step=15680, skipped=14, lr=[0.0005854470187380935, 0.0005854470187380935], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15680 loss: 2.4176 iter time (s): 63.377 samples/sec: 16.157 %comms: 0.0028773540213866094 %optimizer_step 0.057723515204819546 %forward: 22.976146842754062 %backward: 61.612099884372554 [2025-04-05 08:22:13,272] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25928.19 | forward: 145614.94 | backward_microstep: 390490.37 | backward: 390476.37 | backward_inner_microstep: 390458.31 | backward_inner: 390451.26 | backward_allreduce_microstep: 8.51 | backward_allreduce: 2.93 | reduce_tied_grads: 0.32 | comms: 18.24 | reduce_grads: 0.20 | step: 365.83 | _step_clipping: 0.12 | _step_step: 364.00 | _step_zero_grad: 0.52 | _step_check_overflow: 0.59 samples/sec: 16.157 | iteration 15680/ 143000 | elapsed time per iteration (ms): 63377.1 | learning rate: 5.854E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.410946E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 08:32:40,054] [INFO] [logging.py:60:log_dist] [Rank 0] step=15690, skipped=14, lr=[0.0005854267334528497, 0.0005854267334528497], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15690 loss: 2.4166 iter time (s): 62.678 samples/sec: 16.338 %comms: 0.0029077248305020586 %optimizer_step 0.05703185715055468 %forward: 23.237415879620695 %backward: 62.26815372595893 [2025-04-05 08:32:40,055] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19162.08 | forward: 145646.81 | backward_microstep: 390294.33 | backward: 390282.55 | backward_inner_microstep: 390265.15 | backward_inner: 390256.62 | backward_allreduce_microstep: 8.27 | backward_allreduce: 2.86 | reduce_tied_grads: 0.32 | comms: 18.22 | reduce_grads: 0.20 | step: 357.46 | _step_clipping: 0.13 | _step_step: 355.58 | _step_zero_grad: 0.54 | _step_check_overflow: 0.58 samples/sec: 16.337 | iteration 15690/ 143000 | elapsed time per iteration (ms): 62678.3 | learning rate: 5.854E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.412598E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 08:43:14,207] [INFO] [logging.py:60:log_dist] [Rank 0] step=15700, skipped=14, lr=[0.0005854064343916318, 0.0005854064343916318], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15700 loss: 2.3982 iter time (s): 63.415 samples/sec: 16.148 %comms: 0.0028950236160323944 %optimizer_step 0.057426565504200416 %forward: 23.681753626331105 %backward: 61.551449081076846 [2025-04-05 08:43:14,208] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21937.55 | forward: 150177.15 | backward_microstep: 390338.30 | backward: 390326.71 | backward_inner_microstep: 390309.26 | backward_inner: 390302.49 | backward_allreduce_microstep: 8.35 | backward_allreduce: 2.88 | reduce_tied_grads: 0.33 | comms: 18.36 | reduce_grads: 0.20 | step: 364.17 | _step_clipping: 0.13 | _step_step: 362.18 | _step_zero_grad: 0.53 | _step_check_overflow: 0.73 samples/sec: 16.148 | iteration 15700/ 143000 | elapsed time per iteration (ms): 63415.3 | learning rate: 5.854E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.402155E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 08:53:45,592] [INFO] [logging.py:60:log_dist] [Rank 0] step=15710, skipped=14, lr=[0.0005853861215554197, 0.0005853861215554197], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15710 loss: 2.3946 iter time (s): 63.138 samples/sec: 16.218 %comms: 0.002847865495866593 %optimizer_step 0.05596013972825845 %forward: 23.054068803892655 %backward: 61.83141526833703 [2025-04-05 08:53:45,592] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23719.15 | forward: 145558.46 | backward_microstep: 390404.09 | backward: 390390.34 | backward_inner_microstep: 390373.14 | backward_inner: 390366.41 | backward_allreduce_microstep: 8.12 | backward_allreduce: 2.79 | reduce_tied_grads: 0.32 | comms: 17.98 | reduce_grads: 0.20 | step: 353.32 | _step_clipping: 0.12 | _step_step: 351.43 | _step_zero_grad: 0.53 | _step_check_overflow: 0.64 samples/sec: 16.218 | iteration 15710/ 143000 | elapsed time per iteration (ms): 63138.4 | learning rate: 5.854E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.399837E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 09:04:11,813] [INFO] [logging.py:60:log_dist] [Rank 0] step=15720, skipped=14, lr=[0.0005853657949451936, 0.0005853657949451936], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15720 loss: 2.4123 iter time (s): 62.622 samples/sec: 16.352 %comms: 0.002898682068500529 %optimizer_step 0.05694555820332352 %forward: 23.243404053807424 %backward: 62.33290283846677 [2025-04-05 09:04:11,814] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18612.76 | forward: 145553.82 | backward_microstep: 390352.43 | backward: 390338.34 | backward_inner_microstep: 390320.79 | backward_inner: 390313.88 | backward_allreduce_microstep: 8.22 | backward_allreduce: 2.84 | reduce_tied_grads: 0.30 | comms: 18.15 | reduce_grads: 0.19 | step: 356.60 | _step_clipping: 0.12 | _step_step: 354.88 | _step_zero_grad: 0.52 | _step_check_overflow: 0.51 samples/sec: 16.352 | iteration 15720/ 143000 | elapsed time per iteration (ms): 62622.1 | learning rate: 5.854E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.408762E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 09:14:36,931] [INFO] [logging.py:60:log_dist] [Rank 0] step=15730, skipped=14, lr=[0.0005853454545619346, 0.0005853454545619346], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15730 loss: 2.4069 iter time (s): 62.511 samples/sec: 16.381 %comms: 0.002880191669821747 %optimizer_step 0.05794503519046833 %forward: 23.267612576492173 %backward: 62.40658611151409 [2025-04-05 09:14:36,932] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17879.29 | forward: 145448.59 | backward_microstep: 390119.96 | backward: 390110.92 | backward_inner_microstep: 390094.39 | backward_inner: 390087.94 | backward_allreduce_microstep: 7.94 | backward_allreduce: 2.71 | reduce_tied_grads: 0.32 | comms: 18.00 | reduce_grads: 0.21 | step: 362.22 | _step_clipping: 0.12 | _step_step: 360.31 | _step_zero_grad: 0.56 | _step_check_overflow: 0.61 samples/sec: 16.381 | iteration 15730/ 143000 | elapsed time per iteration (ms): 62511.8 | learning rate: 5.853E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.399780E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 09:25:03,840] [INFO] [logging.py:60:log_dist] [Rank 0] step=15740, skipped=14, lr=[0.0005853251004066244, 0.0005853251004066244], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15740 loss: 2.4009 iter time (s): 62.690 samples/sec: 16.334 %comms: 0.0028823088707294075 %optimizer_step 0.0607202015834828 %forward: 23.183624431487935 %backward: 62.227056444594844 [2025-04-05 09:25:03,840] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19794.32 | forward: 145338.73 | backward_microstep: 390112.88 | backward: 390102.99 | backward_inner_microstep: 390086.08 | backward_inner: 390079.56 | backward_allreduce_microstep: 8.00 | backward_allreduce: 2.74 | reduce_tied_grads: 0.28 | comms: 18.07 | reduce_grads: 0.19 | step: 380.66 | _step_clipping: 0.12 | _step_step: 378.77 | _step_zero_grad: 0.53 | _step_check_overflow: 0.65 samples/sec: 16.334 | iteration 15740/ 143000 | elapsed time per iteration (ms): 62690.9 | learning rate: 5.853E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.403947E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 09:35:30,780] [INFO] [logging.py:60:log_dist] [Rank 0] step=15750, skipped=14, lr=[0.0005853047324802454, 0.0005853047324802454], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15750 loss: 2.4034 iter time (s): 62.693 samples/sec: 16.333 %comms: 0.002911788267921374 %optimizer_step 0.0610428209753392 %forward: 23.184624361205795 %backward: 62.2235830815954 [2025-04-05 09:35:30,781] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19766.69 | forward: 145352.33 | backward_microstep: 390110.64 | backward: 390100.90 | backward_inner_microstep: 390083.75 | backward_inner: 390077.13 | backward_allreduce_microstep: 8.30 | backward_allreduce: 2.97 | reduce_tied_grads: 0.32 | comms: 18.25 | reduce_grads: 0.21 | step: 382.70 | _step_clipping: 0.12 | _step_step: 380.82 | _step_zero_grad: 0.53 | _step_check_overflow: 0.59 samples/sec: 16.333 | iteration 15750/ 143000 | elapsed time per iteration (ms): 62694.1 | learning rate: 5.853E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.402853E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 09:46:00,516] [INFO] [logging.py:60:log_dist] [Rank 0] step=15760, skipped=14, lr=[0.0005852843507837808, 0.0005852843507837808], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15760 loss: 2.4080 iter time (s): 62.973 samples/sec: 16.261 %comms: 0.0028967429658677845 %optimizer_step 0.05974985273060001 %forward: 23.11008964704863 %backward: 61.97751860542123 [2025-04-05 09:46:00,517] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22187.30 | forward: 145531.04 | backward_microstep: 390301.86 | backward: 390290.69 | backward_inner_microstep: 390273.27 | backward_inner: 390264.82 | backward_allreduce_microstep: 8.26 | backward_allreduce: 2.84 | reduce_tied_grads: 0.41 | comms: 18.24 | reduce_grads: 0.26 | step: 376.26 | _step_clipping: 0.15 | _step_step: 374.22 | _step_zero_grad: 0.54 | _step_check_overflow: 0.68 samples/sec: 16.261 | iteration 15760/ 143000 | elapsed time per iteration (ms): 62973.6 | learning rate: 5.853E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.407778E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 09:56:39,503] [INFO] [logging.py:60:log_dist] [Rank 0] step=15770, skipped=14, lr=[0.000585263955318214, 0.000585263955318214], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15770 loss: 2.4040 iter time (s): 63.898 samples/sec: 16.026 %comms: 0.002836780909051629 %optimizer_step 0.05695001475996942 %forward: 22.779678637647034 %backward: 61.07268832882985 [2025-04-05 09:56:39,504] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31479.75 | forward: 145557.79 | backward_microstep: 390253.88 | backward: 390242.80 | backward_inner_microstep: 390225.41 | backward_inner: 390218.74 | backward_allreduce_microstep: 8.31 | backward_allreduce: 2.86 | reduce_tied_grads: 0.32 | comms: 18.13 | reduce_grads: 0.20 | step: 363.90 | _step_clipping: 0.15 | _step_step: 362.07 | _step_zero_grad: 0.51 | _step_check_overflow: 0.55 samples/sec: 16.025 | iteration 15770/ 143000 | elapsed time per iteration (ms): 63898.7 | learning rate: 5.853E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.404074E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 10:07:08,013] [INFO] [logging.py:60:log_dist] [Rank 0] step=15780, skipped=14, lr=[0.0005852435460845296, 0.0005852435460845296], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15780 loss: 2.4227 iter time (s): 62.850 samples/sec: 16.293 %comms: 0.0029286409674081464 %optimizer_step 0.05678569702628987 %forward: 23.126101084249058 %backward: 62.0787511643385 [2025-04-05 10:07:08,014] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21270.38 | forward: 145348.50 | backward_microstep: 390176.88 | backward: 390167.52 | backward_inner_microstep: 390149.95 | backward_inner: 390143.35 | backward_allreduce_microstep: 8.42 | backward_allreduce: 2.89 | reduce_tied_grads: 0.33 | comms: 18.41 | reduce_grads: 0.21 | step: 356.90 | _step_clipping: 0.14 | _step_step: 354.86 | _step_zero_grad: 0.56 | _step_check_overflow: 0.67 samples/sec: 16.292 | iteration 15780/ 143000 | elapsed time per iteration (ms): 62851.0 | learning rate: 5.852E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.405497E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 10:17:35,560] [INFO] [logging.py:60:log_dist] [Rank 0] step=15790, skipped=14, lr=[0.0005852231230837127, 0.0005852231230837127], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15790 loss: 2.4072 iter time (s): 62.754 samples/sec: 16.318 %comms: 0.0029007668866886187 %optimizer_step 0.05695466256327594 %forward: 23.162129801427515 %backward: 62.186752518779265 [2025-04-05 10:17:35,561] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20275.80 | forward: 145351.82 | backward_microstep: 390258.92 | backward: 390247.27 | backward_inner_microstep: 390230.36 | backward_inner: 390224.00 | backward_allreduce_microstep: 8.10 | backward_allreduce: 2.79 | reduce_tied_grads: 0.31 | comms: 18.20 | reduce_grads: 0.20 | step: 357.41 | _step_clipping: 0.15 | _step_step: 355.45 | _step_zero_grad: 0.56 | _step_check_overflow: 0.66 samples/sec: 16.318 | iteration 15790/ 143000 | elapsed time per iteration (ms): 62754.7 | learning rate: 5.852E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.422837E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 10:28:11,736] [INFO] [logging.py:60:log_dist] [Rank 0] step=15800, skipped=14, lr=[0.0005852026863167488, 0.0005852026863167488], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15800 loss: 2.3880 iter time (s): 63.617 samples/sec: 16.096 %comms: 0.0028742394213562444 %optimizer_step 0.05740544918032256 %forward: 22.8845614161403 %backward: 61.35216033697094 [2025-04-05 10:28:11,737] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28572.00 | forward: 145584.61 | backward_microstep: 390315.75 | backward: 390303.76 | backward_inner_microstep: 390286.30 | backward_inner: 390279.55 | backward_allreduce_microstep: 8.32 | backward_allreduce: 2.84 | reduce_tied_grads: 0.35 | comms: 18.29 | reduce_grads: 0.21 | step: 365.20 | _step_clipping: 0.18 | _step_step: 363.07 | _step_zero_grad: 0.56 | _step_check_overflow: 0.73 samples/sec: 16.096 | iteration 15800/ 143000 | elapsed time per iteration (ms): 63617.6 | learning rate: 5.852E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.400928E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 10:38:42,813] [INFO] [logging.py:60:log_dist] [Rank 0] step=15810, skipped=14, lr=[0.0005851822357846245, 0.0005851822357846245], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15810 loss: 2.3997 iter time (s): 63.107 samples/sec: 16.226 %comms: 0.002899883310232489 %optimizer_step 0.05728162091394227 %forward: 23.061266091259604 %backward: 61.85532614962126 [2025-04-05 10:38:42,813] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23453.37 | forward: 145532.74 | backward_microstep: 390366.56 | backward: 390350.43 | backward_inner_microstep: 390332.80 | backward_inner: 390325.99 | backward_allreduce_microstep: 8.37 | backward_allreduce: 2.85 | reduce_tied_grads: 0.36 | comms: 18.30 | reduce_grads: 0.22 | step: 361.49 | _step_clipping: 0.12 | _step_step: 359.55 | _step_zero_grad: 0.57 | _step_check_overflow: 0.62 samples/sec: 16.226 | iteration 15810/ 143000 | elapsed time per iteration (ms): 63107.6 | learning rate: 5.852E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.399606E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 10:49:10,196] [INFO] [logging.py:60:log_dist] [Rank 0] step=15820, skipped=14, lr=[0.0005851617714883265, 0.0005851617714883265], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15820 loss: 2.4033 iter time (s): 62.738 samples/sec: 16.322 %comms: 0.002940628595038878 %optimizer_step 0.059135896636970255 %forward: 23.180748302138603 %backward: 62.20912561623665 [2025-04-05 10:49:10,197] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19977.01 | forward: 145430.70 | backward_microstep: 390297.21 | backward: 390285.79 | backward_inner_microstep: 390267.54 | backward_inner: 390260.44 | backward_allreduce_microstep: 8.79 | backward_allreduce: 3.05 | reduce_tied_grads: 0.36 | comms: 18.45 | reduce_grads: 0.23 | step: 371.01 | _step_clipping: 0.14 | _step_step: 369.01 | _step_zero_grad: 0.58 | _step_check_overflow: 0.62 samples/sec: 16.322 | iteration 15820/ 143000 | elapsed time per iteration (ms): 62738.4 | learning rate: 5.852E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.404858E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 10:59:47,755] [INFO] [logging.py:60:log_dist] [Rank 0] step=15830, skipped=14, lr=[0.0005851412934288428, 0.0005851412934288428], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15830 loss: 2.4024 iter time (s): 63.755 samples/sec: 16.061 %comms: 0.0028553981854097105 %optimizer_step 0.056319920778219644 %forward: 22.861218439458888 %backward: 61.213118140385326 [2025-04-05 10:59:47,756] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29831.44 | forward: 145752.48 | backward_microstep: 390277.17 | backward: 390266.33 | backward_inner_microstep: 390248.56 | backward_inner: 390241.73 | backward_allreduce_microstep: 8.54 | backward_allreduce: 2.90 | reduce_tied_grads: 0.32 | comms: 18.20 | reduce_grads: 0.20 | step: 359.07 | _step_clipping: 0.13 | _step_step: 357.25 | _step_zero_grad: 0.53 | _step_check_overflow: 0.56 samples/sec: 16.061 | iteration 15830/ 143000 | elapsed time per iteration (ms): 63755.9 | learning rate: 5.851E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.403149E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 11:10:13,068] [INFO] [logging.py:60:log_dist] [Rank 0] step=15840, skipped=14, lr=[0.0005851208016071617, 0.0005851208016071617], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15840 loss: 2.4078 iter time (s): 62.531 samples/sec: 16.376 %comms: 0.002901099737372196 %optimizer_step 0.05555856880732354 %forward: 23.233936975965293 %backward: 62.3898478923111 [2025-04-05 11:10:13,069] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18260.13 | forward: 145283.54 | backward_microstep: 390137.66 | backward: 390128.37 | backward_inner_microstep: 390111.33 | backward_inner: 390104.88 | backward_allreduce_microstep: 8.20 | backward_allreduce: 2.83 | reduce_tied_grads: 0.28 | comms: 18.14 | reduce_grads: 0.20 | step: 347.41 | _step_clipping: 0.11 | _step_step: 345.68 | _step_zero_grad: 0.50 | _step_check_overflow: 0.55 samples/sec: 16.376 | iteration 15840/ 143000 | elapsed time per iteration (ms): 62531.3 | learning rate: 5.851E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.405414E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 11:20:39,705] [INFO] [logging.py:60:log_dist] [Rank 0] step=15850, skipped=14, lr=[0.0005851002960242722, 0.0005851002960242722], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15850 loss: 2.3843 iter time (s): 62.663 samples/sec: 16.341 %comms: 0.0028749189624393914 %optimizer_step 0.05613676777052763 %forward: 23.173503356124222 %backward: 62.24843106960067 [2025-04-05 11:20:39,706] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19706.93 | forward: 145212.46 | backward_microstep: 390076.96 | backward: 390068.24 | backward_inner_microstep: 390051.65 | backward_inner: 390045.08 | backward_allreduce_microstep: 7.96 | backward_allreduce: 2.74 | reduce_tied_grads: 0.31 | comms: 18.02 | reduce_grads: 0.20 | step: 351.77 | _step_clipping: 0.12 | _step_step: 350.03 | _step_zero_grad: 0.51 | _step_check_overflow: 0.52 samples/sec: 16.341 | iteration 15850/ 143000 | elapsed time per iteration (ms): 62663.7 | learning rate: 5.851E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.396002E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 11:31:08,774] [INFO] [logging.py:60:log_dist] [Rank 0] step=15860, skipped=14, lr=[0.000585079776681164, 0.000585079776681164], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15860 loss: 2.3960 iter time (s): 62.906 samples/sec: 16.278 %comms: 0.0036019202780120316 %optimizer_step 0.05762136300296471 %forward: 23.138301146205308 %backward: 62.04750209930883 [2025-04-05 11:31:08,775] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21493.11 | forward: 145554.49 | backward_microstep: 390329.55 | backward: 390317.88 | backward_inner_microstep: 390300.15 | backward_inner: 390293.40 | backward_allreduce_microstep: 8.51 | backward_allreduce: 2.91 | reduce_tied_grads: 0.33 | comms: 22.66 | reduce_grads: 0.21 | step: 362.47 | _step_clipping: 0.12 | _step_step: 360.65 | _step_zero_grad: 0.52 | _step_check_overflow: 0.58 samples/sec: 16.278 | iteration 15860/ 143000 | elapsed time per iteration (ms): 62906.9 | learning rate: 5.851E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.404749E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 11:41:34,237] [INFO] [logging.py:60:log_dist] [Rank 0] step=15870, skipped=14, lr=[0.0005850592435788273, 0.0005850592435788273], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15870 loss: 2.4091 iter time (s): 62.546 samples/sec: 16.372 %comms: 0.002897013692804462 %optimizer_step 0.056542446356843995 %forward: 23.252987005365966 %backward: 62.39699262113623 [2025-04-05 11:41:34,237] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18049.24 | forward: 145437.42 | backward_microstep: 390277.59 | backward: 390266.33 | backward_inner_microstep: 390248.70 | backward_inner: 390240.06 | backward_allreduce_microstep: 8.37 | backward_allreduce: 2.91 | reduce_tied_grads: 0.33 | comms: 18.12 | reduce_grads: 0.20 | step: 353.65 | _step_clipping: 0.11 | _step_step: 351.80 | _step_zero_grad: 0.53 | _step_check_overflow: 0.56 samples/sec: 16.372 | iteration 15870/ 143000 | elapsed time per iteration (ms): 62546.3 | learning rate: 5.851E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.406807E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 11:52:04,025] [INFO] [logging.py:60:log_dist] [Rank 0] step=15880, skipped=14, lr=[0.0005850386967182535, 0.0005850386967182535], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15880 loss: 2.4066 iter time (s): 62.978 samples/sec: 16.260 %comms: 0.002882419569683844 %optimizer_step 0.0564076551221627 %forward: 23.104243809361062 %backward: 62.00572759357967 [2025-04-05 11:52:04,025] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22051.74 | forward: 145506.31 | backward_microstep: 390513.05 | backward: 390500.75 | backward_inner_microstep: 390483.02 | backward_inner: 390476.08 | backward_allreduce_microstep: 8.40 | backward_allreduce: 2.87 | reduce_tied_grads: 0.33 | comms: 18.15 | reduce_grads: 0.21 | step: 355.25 | _step_clipping: 0.13 | _step_step: 353.41 | _step_zero_grad: 0.51 | _step_check_overflow: 0.59 samples/sec: 16.259 | iteration 15880/ 143000 | elapsed time per iteration (ms): 62978.8 | learning rate: 5.850E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.409895E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 12:02:28,444] [INFO] [logging.py:60:log_dist] [Rank 0] step=15890, skipped=14, lr=[0.0005850181361004338, 0.0005850181361004338], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15890 loss: 2.4072 iter time (s): 62.441 samples/sec: 16.399 %comms: 0.0028847096608569916 %optimizer_step 0.05596806390359433 %forward: 23.304116953159866 %backward: 62.5142542076379 [2025-04-05 12:02:28,444] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16852.83 | forward: 145514.11 | backward_microstep: 390356.24 | backward: 390347.60 | backward_inner_microstep: 390330.59 | backward_inner: 390324.00 | backward_allreduce_microstep: 8.12 | backward_allreduce: 2.80 | reduce_tied_grads: 0.32 | comms: 18.01 | reduce_grads: 0.20 | step: 349.47 | _step_clipping: 0.12 | _step_step: 347.84 | _step_zero_grad: 0.45 | _step_check_overflow: 0.51 samples/sec: 16.399 | iteration 15890/ 143000 | elapsed time per iteration (ms): 62441.9 | learning rate: 5.850E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.400761E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 12:12:53,445] [INFO] [logging.py:60:log_dist] [Rank 0] step=15900, skipped=14, lr=[0.0005849975617263609, 0.0005849975617263609], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15900 loss: 2.4037 iter time (s): 62.499 samples/sec: 16.384 %comms: 0.0028939676134120945 %optimizer_step 0.05735280526946874 %forward: 23.287434659081363 %backward: 62.485281989990696 [2025-04-05 12:12:53,445] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17158.24 | forward: 145545.27 | backward_microstep: 390539.70 | backward: 390529.80 | backward_inner_microstep: 390512.89 | backward_inner: 390506.23 | backward_allreduce_microstep: 8.08 | backward_allreduce: 2.75 | reduce_tied_grads: 0.30 | comms: 18.09 | reduce_grads: 0.20 | step: 358.45 | _step_clipping: 0.11 | _step_step: 356.76 | _step_zero_grad: 0.50 | _step_check_overflow: 0.50 samples/sec: 16.384 | iteration 15900/ 143000 | elapsed time per iteration (ms): 62500.1 | learning rate: 5.850E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.398398E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 12:23:32,109] [INFO] [logging.py:60:log_dist] [Rank 0] step=15910, skipped=14, lr=[0.0005849769735970277, 0.0005849769735970277], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15910 loss: 2.4019 iter time (s): 63.866 samples/sec: 16.034 %comms: 0.0028376163995663855 %optimizer_step 0.05527315530017343 %forward: 22.817644652671934 %backward: 61.12585910663857 [2025-04-05 12:23:32,109] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30838.76 | forward: 145726.78 | backward_microstep: 390396.44 | backward: 390385.38 | backward_inner_microstep: 390367.52 | backward_inner: 390360.56 | backward_allreduce_microstep: 8.59 | backward_allreduce: 3.05 | reduce_tied_grads: 0.33 | comms: 18.12 | reduce_grads: 0.22 | step: 353.01 | _step_clipping: 0.12 | _step_step: 351.29 | _step_zero_grad: 0.53 | _step_check_overflow: 0.46 samples/sec: 16.033 | iteration 15910/ 143000 | elapsed time per iteration (ms): 63866.4 | learning rate: 5.850E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.403093E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 12:34:08,622] [INFO] [logging.py:60:log_dist] [Rank 0] step=15920, skipped=14, lr=[0.0005849563717134279, 0.0005849563717134279], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15920 loss: 2.4180 iter time (s): 63.651 samples/sec: 16.088 %comms: 0.002823604889335228 %optimizer_step 0.0548947716095017 %forward: 23.4836571539838 %backward: 61.35732211541753 [2025-04-05 12:34:08,623] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24738.61 | forward: 149475.34 | backward_microstep: 390556.08 | backward: 390544.23 | backward_inner_microstep: 390526.91 | backward_inner: 390520.02 | backward_allreduce_microstep: 8.18 | backward_allreduce: 2.85 | reduce_tied_grads: 0.29 | comms: 17.97 | reduce_grads: 0.19 | step: 349.41 | _step_clipping: 0.12 | _step_step: 347.53 | _step_zero_grad: 0.53 | _step_check_overflow: 0.63 samples/sec: 16.088 | iteration 15920/ 143000 | elapsed time per iteration (ms): 63651.4 | learning rate: 5.850E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.401205E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 12:44:43,745] [INFO] [logging.py:60:log_dist] [Rank 0] step=15930, skipped=14, lr=[0.0005849357560765557, 0.0005849357560765557], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15930 loss: 2.4159 iter time (s): 63.512 samples/sec: 16.123 %comms: 0.0029474779756064624 %optimizer_step 0.05801445403978114 %forward: 23.022408596505166 %backward: 61.57088384817079 [2025-04-05 12:44:43,746] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25946.83 | forward: 146219.06 | backward_microstep: 391065.84 | backward: 391046.69 | backward_inner_microstep: 391028.24 | backward_inner: 391020.83 | backward_allreduce_microstep: 8.48 | backward_allreduce: 2.89 | reduce_tied_grads: 0.34 | comms: 18.72 | reduce_grads: 0.21 | step: 368.46 | _step_clipping: 0.14 | _step_step: 366.40 | _step_zero_grad: 0.58 | _step_check_overflow: 0.67 samples/sec: 16.123 | iteration 15930/ 143000 | elapsed time per iteration (ms): 63512.3 | learning rate: 5.849E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.410538E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 12:55:17,843] [INFO] [logging.py:60:log_dist] [Rank 0] step=15940, skipped=14, lr=[0.0005849151266874064, 0.0005849151266874064], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15940 loss: 2.4109 iter time (s): 63.409 samples/sec: 16.149 %comms: 0.0028672635051915276 %optimizer_step 0.05771363734303955 %forward: 22.976192596785943 %backward: 61.58054794023352 [2025-04-05 12:55:17,844] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26158.99 | forward: 145690.18 | backward_microstep: 390488.03 | backward: 390477.27 | backward_inner_microstep: 390459.91 | backward_inner: 390452.97 | backward_allreduce_microstep: 8.29 | backward_allreduce: 2.91 | reduce_tied_grads: 0.34 | comms: 18.18 | reduce_grads: 0.21 | step: 365.96 | _step_clipping: 0.13 | _step_step: 363.97 | _step_zero_grad: 0.54 | _step_check_overflow: 0.71 samples/sec: 16.149 | iteration 15940/ 143000 | elapsed time per iteration (ms): 63409.8 | learning rate: 5.849E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.409186E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 13:05:58,430] [INFO] [logging.py:60:log_dist] [Rank 0] step=15950, skipped=14, lr=[0.0005848944835469753, 0.0005848944835469753], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15950 loss: 2.4038 iter time (s): 64.058 samples/sec: 15.985 %comms: 0.0028594347542684577 %optimizer_step 0.05741789054031059 %forward: 22.753852849477553 %backward: 60.956933029900185 [2025-04-05 13:05:58,431] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32573.26 | forward: 145756.77 | backward_microstep: 390492.92 | backward: 390478.28 | backward_inner_microstep: 390460.02 | backward_inner: 390452.97 | backward_allreduce_microstep: 8.73 | backward_allreduce: 2.99 | reduce_tied_grads: 0.34 | comms: 18.32 | reduce_grads: 0.22 | step: 367.81 | _step_clipping: 0.15 | _step_step: 365.79 | _step_zero_grad: 0.56 | _step_check_overflow: 0.64 samples/sec: 15.985 | iteration 15950/ 143000 | elapsed time per iteration (ms): 64058.7 | learning rate: 5.849E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.402319E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 13:16:35,383] [INFO] [logging.py:60:log_dist] [Rank 0] step=15960, skipped=14, lr=[0.0005848738266562591, 0.0005848738266562591], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15960 loss: 2.4367 iter time (s): 63.695 samples/sec: 16.077 %comms: 0.0028692364418846867 %optimizer_step 0.06101112957512385 %forward: 22.86781676257369 %backward: 61.28508129703698 [2025-04-05 13:16:35,384] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29144.49 | forward: 145655.74 | backward_microstep: 390367.34 | backward: 390353.14 | backward_inner_microstep: 390334.98 | backward_inner: 390327.85 | backward_allreduce_microstep: 8.64 | backward_allreduce: 2.97 | reduce_tied_grads: 0.36 | comms: 18.28 | reduce_grads: 0.42 | step: 388.61 | _step_clipping: 0.15 | _step_step: 386.44 | _step_zero_grad: 0.61 | _step_check_overflow: 0.74 samples/sec: 16.077 | iteration 15960/ 143000 | elapsed time per iteration (ms): 63695.3 | learning rate: 5.849E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.408330E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 13:27:23,038] [INFO] [logging.py:60:log_dist] [Rank 0] step=15970, skipped=14, lr=[0.0005848531560162546, 0.0005848531560162546], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15970 loss: 2.4239 iter time (s): 64.765 samples/sec: 15.811 %comms: 0.0027906458109030995 %optimizer_step 0.05523208080128193 %forward: 22.521969083601075 %backward: 60.26671636175415 [2025-04-05 13:27:23,038] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 39717.25 | forward: 145863.06 | backward_microstep: 390329.21 | backward: 390316.12 | backward_inner_microstep: 390298.00 | backward_inner: 390291.11 | backward_allreduce_microstep: 8.72 | backward_allreduce: 3.18 | reduce_tied_grads: 0.34 | comms: 18.07 | reduce_grads: 0.22 | step: 357.71 | _step_clipping: 0.12 | _step_step: 355.80 | _step_zero_grad: 0.57 | _step_check_overflow: 0.59 samples/sec: 15.811 | iteration 15970/ 143000 | elapsed time per iteration (ms): 64765.4 | learning rate: 5.849E-04 | approx flops per GPU: 68.2TFLOPS | lm_loss: 2.415053E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 13:37:51,311] [INFO] [logging.py:60:log_dist] [Rank 0] step=15980, skipped=14, lr=[0.0005848324716279593, 0.0005848324716279593], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15980 loss: 2.4018 iter time (s): 62.827 samples/sec: 16.299 %comms: 0.0028668996382469935 %optimizer_step 0.059169655767299566 %forward: 23.151741401088177 %backward: 62.12831304312978 [2025-04-05 13:37:51,312] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20719.12 | forward: 145454.95 | backward_microstep: 390344.98 | backward: 390332.21 | backward_inner_microstep: 390315.37 | backward_inner: 390308.76 | backward_allreduce_microstep: 7.94 | backward_allreduce: 2.77 | reduce_tied_grads: 0.29 | comms: 18.01 | reduce_grads: 0.19 | step: 371.74 | _step_clipping: 0.12 | _step_step: 370.02 | _step_zero_grad: 0.53 | _step_check_overflow: 0.47 samples/sec: 16.299 | iteration 15980/ 143000 | elapsed time per iteration (ms): 62827.3 | learning rate: 5.848E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.402849E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 13:48:26,510] [INFO] [logging.py:60:log_dist] [Rank 0] step=15990, skipped=14, lr=[0.0005848117734923719, 0.0005848117734923719], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 15990 loss: 2.4094 iter time (s): 63.519 samples/sec: 16.121 %comms: 0.0028394713421515615 %optimizer_step 0.05829554889454776 %forward: 22.931456490908317 %backward: 61.456460035930995 [2025-04-05 13:48:26,511] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27377.57 | forward: 145659.04 | backward_microstep: 390380.21 | backward: 390367.22 | backward_inner_microstep: 390349.03 | backward_inner: 390340.14 | backward_allreduce_microstep: 8.62 | backward_allreduce: 2.93 | reduce_tied_grads: 0.32 | comms: 18.04 | reduce_grads: 0.22 | step: 370.29 | _step_clipping: 0.13 | _step_step: 368.45 | _step_zero_grad: 0.59 | _step_check_overflow: 0.50 samples/sec: 16.121 | iteration 15990/ 143000 | elapsed time per iteration (ms): 63519.9 | learning rate: 5.848E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.403528E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 13:58:58,200] [INFO] [logging.py:60:log_dist] [Rank 0] step=16000, skipped=14, lr=[0.000584791061610491, 0.000584791061610491], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16000 loss: 2.4043 iter time (s): 63.168 samples/sec: 16.211 %comms: 0.002935600206495081 %optimizer_step 0.0581934108665464 %forward: 23.095425298257904 %backward: 61.83485190770528 [2025-04-05 13:58:58,201] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23406.01 | forward: 145890.13 | backward_microstep: 390617.54 | backward: 390600.94 | backward_inner_microstep: 390579.13 | backward_inner: 390571.81 | backward_allreduce_microstep: 10.22 | backward_allreduce: 4.70 | reduce_tied_grads: 0.34 | comms: 18.54 | reduce_grads: 0.22 | step: 367.60 | _step_clipping: 0.13 | _step_step: 365.47 | _step_zero_grad: 0.59 | _step_check_overflow: 0.74 samples/sec: 16.210 | iteration 16000/ 143000 | elapsed time per iteration (ms): 63169.0 | learning rate: 5.848E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.411326E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 13:59:01,082] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: ../checkpoints/mydata_left-pythia160m/global_step16000/mp_rank_00_model_states.pt [2025-04-05 13:59:14,950] [INFO] [engine.py:1805:_copy_recovery_script] creating recovery script ../checkpoints/mydata_left-pythia160m/zero_to_fp32.py [2025-04-05 13:59:14,956] [INFO] [engine.py:1818:_save_zero_checkpoint] zero checkpoint saved ../checkpoints/mydata_left-pythia160m/global_step16000/zero_pp_rank_0_mp_rank_00_optim_states.pt [2025-04-05 14:09:46,875] [INFO] [logging.py:60:log_dist] [Rank 0] step=16010, skipped=14, lr=[0.0005847703359833164, 0.0005847703359833164], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16010 loss: 2.4116 iter time (s): 63.190 samples/sec: 16.205 %comms: 0.002913865816327105 %optimizer_step 0.057336513724797734 %forward: 23.030020908772812 %backward: 61.763481779195864 [2025-04-05 14:09:46,875] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24359.48 | forward: 145527.57 | backward_microstep: 390296.43 | backward: 390285.78 | backward_inner_microstep: 390268.92 | backward_inner: 390262.46 | backward_allreduce_microstep: 8.08 | backward_allreduce: 2.78 | reduce_tied_grads: 0.34 | comms: 18.41 | reduce_grads: 0.20 | step: 362.31 | _step_clipping: 0.14 | _step_step: 360.42 | _step_zero_grad: 0.55 | _step_check_overflow: 0.53 samples/sec: 15.786 | iteration 16010/ 143000 | elapsed time per iteration (ms): 64867.4 | learning rate: 5.848E-04 | approx flops per GPU: 68.1TFLOPS | lm_loss: 2.404251E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 14:20:19,090] [INFO] [logging.py:60:log_dist] [Rank 0] step=16020, skipped=14, lr=[0.0005847495966118485, 0.0005847495966118485], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16020 loss: 2.3900 iter time (s): 63.221 samples/sec: 16.197 %comms: 0.002907216626370755 %optimizer_step 0.05739367554274899 %forward: 23.04953858853675 %backward: 61.761726776707924 [2025-04-05 14:20:19,091] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24252.73 | forward: 145721.28 | backward_microstep: 390477.10 | backward: 390463.26 | backward_inner_microstep: 390444.01 | backward_inner: 390437.13 | backward_allreduce_microstep: 8.29 | backward_allreduce: 2.85 | reduce_tied_grads: 0.33 | comms: 18.38 | reduce_grads: 0.21 | step: 362.85 | _step_clipping: 0.16 | _step_step: 360.95 | _step_zero_grad: 0.53 | _step_check_overflow: 0.57 samples/sec: 16.197 | iteration 16020/ 143000 | elapsed time per iteration (ms): 63221.5 | learning rate: 5.847E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.393682E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 14:30:53,512] [INFO] [logging.py:60:log_dist] [Rank 0] step=16030, skipped=14, lr=[0.0005847288434970883, 0.0005847288434970883], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16030 loss: 2.4038 iter time (s): 63.442 samples/sec: 16.141 %comms: 0.002881958004350131 %optimizer_step 0.05876891378093737 %forward: 22.944215798369676 %backward: 61.52808970887412 [2025-04-05 14:30:53,513] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26744.61 | forward: 145561.80 | backward_microstep: 390356.27 | backward: 390344.11 | backward_inner_microstep: 390326.29 | backward_inner: 390319.36 | backward_allreduce_microstep: 8.46 | backward_allreduce: 2.94 | reduce_tied_grads: 0.36 | comms: 18.28 | reduce_grads: 0.22 | step: 372.84 | _step_clipping: 0.13 | _step_step: 370.77 | _step_zero_grad: 0.55 | _step_check_overflow: 0.76 samples/sec: 16.141 | iteration 16030/ 143000 | elapsed time per iteration (ms): 63442.3 | learning rate: 5.847E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.405249E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 14:41:31,543] [INFO] [logging.py:60:log_dist] [Rank 0] step=16040, skipped=14, lr=[0.0005847080766400374, 0.0005847080766400374], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16040 loss: 2.3833 iter time (s): 63.802 samples/sec: 16.050 %comms: 0.0028366252425325324 %optimizer_step 0.055511833000136 %forward: 22.82052507931694 %backward: 61.18217425341332 [2025-04-05 14:41:31,544] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30350.34 | forward: 145600.46 | backward_microstep: 390370.88 | backward: 390357.05 | backward_inner_microstep: 390339.36 | backward_inner: 390332.49 | backward_allreduce_microstep: 8.49 | backward_allreduce: 3.00 | reduce_tied_grads: 0.32 | comms: 18.10 | reduce_grads: 0.20 | step: 354.18 | _step_clipping: 0.14 | _step_step: 352.25 | _step_zero_grad: 0.55 | _step_check_overflow: 0.61 samples/sec: 16.049 | iteration 16040/ 143000 | elapsed time per iteration (ms): 63803.1 | learning rate: 5.847E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.405718E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 14:52:03,603] [INFO] [logging.py:60:log_dist] [Rank 0] step=16050, skipped=14, lr=[0.0005846872960416979, 0.0005846872960416979], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16050 loss: 2.4067 iter time (s): 63.205 samples/sec: 16.201 %comms: 0.0028604387077150587 %optimizer_step 0.05722529605370879 %forward: 23.025695204239636 %backward: 61.76483427433057 [2025-04-05 14:52:03,604] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24344.73 | forward: 145534.84 | backward_microstep: 390398.32 | backward: 390387.15 | backward_inner_microstep: 390369.87 | backward_inner: 390363.04 | backward_allreduce_microstep: 8.24 | backward_allreduce: 2.86 | reduce_tied_grads: 0.35 | comms: 18.08 | reduce_grads: 0.21 | step: 361.69 | _step_clipping: 0.15 | _step_step: 359.74 | _step_zero_grad: 0.50 | _step_check_overflow: 0.71 samples/sec: 16.201 | iteration 16050/ 143000 | elapsed time per iteration (ms): 63206.0 | learning rate: 5.847E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.406606E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 15:02:40,880] [INFO] [logging.py:60:log_dist] [Rank 0] step=16060, skipped=14, lr=[0.0005846665017030729, 0.0005846665017030729], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16060 loss: 2.4300 iter time (s): 63.727 samples/sec: 16.069 %comms: 0.002874547180533542 %optimizer_step 0.057246191448150976 %forward: 22.8568002468724 %backward: 61.277967931934164 [2025-04-05 15:02:40,881] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29307.59 | forward: 145659.74 | backward_microstep: 390519.69 | backward: 390506.67 | backward_inner_microstep: 390488.73 | backward_inner: 390481.73 | backward_allreduce_microstep: 8.58 | backward_allreduce: 3.09 | reduce_tied_grads: 0.34 | comms: 18.32 | reduce_grads: 0.24 | step: 364.81 | _step_clipping: 0.14 | _step_step: 362.93 | _step_zero_grad: 0.56 | _step_check_overflow: 0.56 samples/sec: 16.068 | iteration 16060/ 143000 | elapsed time per iteration (ms): 63727.7 | learning rate: 5.847E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.410786E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 15:13:14,542] [INFO] [logging.py:60:log_dist] [Rank 0] step=16070, skipped=14, lr=[0.0005846456936251662, 0.0005846456936251662], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16070 loss: 2.4076 iter time (s): 63.366 samples/sec: 16.160 %comms: 0.0028925672524980936 %optimizer_step 0.05828374203300892 %forward: 22.990297940684197 %backward: 61.650962500765594 [2025-04-05 15:13:14,542] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25524.02 | forward: 145679.24 | backward_microstep: 390668.14 | backward: 390654.58 | backward_inner_microstep: 390637.12 | backward_inner: 390630.28 | backward_allreduce_microstep: 8.22 | backward_allreduce: 2.81 | reduce_tied_grads: 0.32 | comms: 18.33 | reduce_grads: 0.19 | step: 369.32 | _step_clipping: 0.14 | _step_step: 367.40 | _step_zero_grad: 0.54 | _step_check_overflow: 0.62 samples/sec: 16.160 | iteration 16070/ 143000 | elapsed time per iteration (ms): 63366.1 | learning rate: 5.846E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.416481E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 15:23:49,497] [INFO] [logging.py:60:log_dist] [Rank 0] step=16080, skipped=14, lr=[0.0005846248718089817, 0.0005846248718089817], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16080 loss: 2.3982 iter time (s): 63.495 samples/sec: 16.127 %comms: 0.002876682497051712 %optimizer_step 0.057929193315013734 %forward: 22.93348718165716 %backward: 61.53377046949117 [2025-04-05 15:23:49,498] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26814.94 | forward: 145616.10 | backward_microstep: 390722.61 | backward: 390708.47 | backward_inner_microstep: 390690.92 | backward_inner: 390684.03 | backward_allreduce_microstep: 8.23 | backward_allreduce: 2.86 | reduce_tied_grads: 0.58 | comms: 18.27 | reduce_grads: 0.23 | step: 367.82 | _step_clipping: 0.16 | _step_step: 365.89 | _step_zero_grad: 0.52 | _step_check_overflow: 0.62 samples/sec: 16.127 | iteration 16080/ 143000 | elapsed time per iteration (ms): 63495.6 | learning rate: 5.846E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.402733E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 15:34:27,510] [INFO] [logging.py:60:log_dist] [Rank 0] step=16090, skipped=14, lr=[0.0005846040362555248, 0.0005846040362555248], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16090 loss: 2.4200 iter time (s): 63.801 samples/sec: 16.050 %comms: 0.002837670920635178 %optimizer_step 0.056273485210616044 %forward: 22.826952165818213 %backward: 61.20464972141202 [2025-04-05 15:34:27,511] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30059.07 | forward: 145637.67 | backward_microstep: 390501.64 | backward: 390490.25 | backward_inner_microstep: 390473.35 | backward_inner: 390466.86 | backward_allreduce_microstep: 8.01 | backward_allreduce: 2.78 | reduce_tied_grads: 0.31 | comms: 18.10 | reduce_grads: 0.20 | step: 359.03 | _step_clipping: 0.13 | _step_step: 357.12 | _step_zero_grad: 0.49 | _step_check_overflow: 0.70 samples/sec: 16.050 | iteration 16090/ 143000 | elapsed time per iteration (ms): 63801.3 | learning rate: 5.846E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.408526E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 15:44:58,025] [INFO] [logging.py:60:log_dist] [Rank 0] step=16100, skipped=14, lr=[0.0005845831869658008, 0.0005845831869658008], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16100 loss: 2.4136 iter time (s): 63.051 samples/sec: 16.241 %comms: 0.002857620010004661 %optimizer_step 0.056704307715958364 %forward: 23.099641967979355 %backward: 61.96423801336874 [2025-04-05 15:44:58,026] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22317.11 | forward: 145645.15 | backward_microstep: 390703.29 | backward: 390689.64 | backward_inner_microstep: 390672.36 | backward_inner: 390665.52 | backward_allreduce_microstep: 8.10 | backward_allreduce: 2.80 | reduce_tied_grads: 0.31 | comms: 18.02 | reduce_grads: 0.19 | step: 357.53 | _step_clipping: 0.13 | _step_step: 355.77 | _step_zero_grad: 0.50 | _step_check_overflow: 0.54 samples/sec: 16.241 | iteration 16100/ 143000 | elapsed time per iteration (ms): 63051.5 | learning rate: 5.846E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.412756E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 15:55:35,071] [INFO] [logging.py:60:log_dist] [Rank 0] step=16110, skipped=14, lr=[0.0005845623239408161, 0.0005845623239408161], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16110 loss: 2.3754 iter time (s): 63.704 samples/sec: 16.074 %comms: 0.0028340460618228 %optimizer_step 0.0564151966704143 %forward: 22.871835313749518 %backward: 61.28667802032886 [2025-04-05 15:55:35,072] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29126.28 | forward: 145702.75 | backward_microstep: 390431.41 | backward: 390420.69 | backward_inner_microstep: 390401.67 | backward_inner: 390394.91 | backward_allreduce_microstep: 8.23 | backward_allreduce: 2.85 | reduce_tied_grads: 0.35 | comms: 18.05 | reduce_grads: 0.21 | step: 359.39 | _step_clipping: 0.13 | _step_step: 357.49 | _step_zero_grad: 0.52 | _step_check_overflow: 0.64 samples/sec: 16.074 | iteration 16110/ 143000 | elapsed time per iteration (ms): 63704.6 | learning rate: 5.846E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.405362E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 16:06:10,496] [INFO] [logging.py:60:log_dist] [Rank 0] step=16120, skipped=14, lr=[0.0005845414471815776, 0.0005845414471815776], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16120 loss: 2.4024 iter time (s): 63.542 samples/sec: 16.115 %comms: 0.002829568456211888 %optimizer_step 0.05619352906773197 %forward: 22.915881422991838 %backward: 61.43355384394846 [2025-04-05 16:06:10,497] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27654.67 | forward: 145611.92 | backward_microstep: 390370.95 | backward: 390360.61 | backward_inner_microstep: 390343.81 | backward_inner: 390337.17 | backward_allreduce_microstep: 8.02 | backward_allreduce: 2.76 | reduce_tied_grads: 0.30 | comms: 17.98 | reduce_grads: 0.21 | step: 357.06 | _step_clipping: 0.13 | _step_step: 355.25 | _step_zero_grad: 0.52 | _step_check_overflow: 0.58 samples/sec: 16.115 | iteration 16120/ 143000 | elapsed time per iteration (ms): 63542.5 | learning rate: 5.845E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.405285E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 16:16:43,504] [INFO] [logging.py:60:log_dist] [Rank 0] step=16130, skipped=14, lr=[0.000584520556689093, 0.000584520556689093], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16130 loss: 2.3953 iter time (s): 63.300 samples/sec: 16.177 %comms: 0.002905798479943968 %optimizer_step 0.056558343496945655 %forward: 23.01642685971083 %backward: 61.68546195822766 [2025-04-05 16:16:43,505] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25050.34 | forward: 145694.38 | backward_microstep: 390481.11 | backward: 390470.05 | backward_inner_microstep: 390452.57 | backward_inner: 390445.78 | backward_allreduce_microstep: 8.38 | backward_allreduce: 2.85 | reduce_tied_grads: 0.31 | comms: 18.39 | reduce_grads: 0.21 | step: 358.02 | _step_clipping: 0.13 | _step_step: 356.07 | _step_zero_grad: 0.55 | _step_check_overflow: 0.62 samples/sec: 16.177 | iteration 16130/ 143000 | elapsed time per iteration (ms): 63300.8 | learning rate: 5.845E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.400470E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 16:27:15,195] [INFO] [logging.py:60:log_dist] [Rank 0] step=16140, skipped=14, lr=[0.0005844996524643704, 0.0005844996524643704], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16140 loss: 2.4017 iter time (s): 63.169 samples/sec: 16.211 %comms: 0.00284138551549772 %optimizer_step 0.05570047113185266 %forward: 23.028211063068003 %backward: 61.79126394528494 [2025-04-05 16:27:15,196] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24142.78 | forward: 145465.93 | backward_microstep: 390337.71 | backward: 390326.61 | backward_inner_microstep: 390309.69 | backward_inner: 390301.45 | backward_allreduce_microstep: 8.14 | backward_allreduce: 2.83 | reduce_tied_grads: 0.29 | comms: 17.95 | reduce_grads: 0.21 | step: 351.85 | _step_clipping: 0.13 | _step_step: 350.14 | _step_zero_grad: 0.50 | _step_check_overflow: 0.51 samples/sec: 16.210 | iteration 16140/ 143000 | elapsed time per iteration (ms): 63169.1 | learning rate: 5.845E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.401129E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 16:37:49,354] [INFO] [logging.py:60:log_dist] [Rank 0] step=16150, skipped=14, lr=[0.000584478734508419, 0.000584478734508419], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16150 loss: 2.4202 iter time (s): 63.415 samples/sec: 16.148 %comms: 0.002838528065082842 %optimizer_step 0.055950583856630924 %forward: 23.53648817425841 %backward: 61.57487765270125 [2025-04-05 16:37:49,354] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22587.68 | forward: 149257.28 | backward_microstep: 390492.00 | backward: 390478.76 | backward_inner_microstep: 390461.36 | backward_inner: 390454.69 | backward_allreduce_microstep: 8.28 | backward_allreduce: 2.86 | reduce_tied_grads: 0.29 | comms: 18.00 | reduce_grads: 0.20 | step: 354.81 | _step_clipping: 0.13 | _step_step: 352.90 | _step_zero_grad: 0.50 | _step_check_overflow: 0.46 samples/sec: 16.147 | iteration 16150/ 143000 | elapsed time per iteration (ms): 63415.8 | learning rate: 5.845E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.404838E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 16:48:25,763] [INFO] [logging.py:60:log_dist] [Rank 0] step=16160, skipped=14, lr=[0.0005844578028222482, 0.0005844578028222482], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16160 loss: 2.4078 iter time (s): 63.640 samples/sec: 16.090 %comms: 0.0028661056648453713 %optimizer_step 0.057891295867308915 %forward: 22.891741793174536 %backward: 61.34566391302523 [2025-04-05 16:48:25,763] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28474.54 | forward: 145683.67 | backward_microstep: 390416.38 | backward: 390405.49 | backward_inner_microstep: 390388.20 | backward_inner: 390381.35 | backward_allreduce_microstep: 8.21 | backward_allreduce: 2.82 | reduce_tied_grads: 0.33 | comms: 18.24 | reduce_grads: 0.22 | step: 368.42 | _step_clipping: 0.14 | _step_step: 366.53 | _step_zero_grad: 0.57 | _step_check_overflow: 0.55 samples/sec: 16.090 | iteration 16160/ 143000 | elapsed time per iteration (ms): 63640.9 | learning rate: 5.845E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.413147E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 16:59:10,979] [INFO] [logging.py:60:log_dist] [Rank 0] step=16170, skipped=14, lr=[0.0005844368574068682, 0.0005844368574068682], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16170 loss: 2.4149 iter time (s): 64.521 samples/sec: 15.871 %comms: 0.0028017070796062748 %optimizer_step 0.055104870560234984 %forward: 22.58483162748874 %backward: 60.495305872076585 [2025-04-05 16:59:10,979] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 37337.70 | forward: 145719.61 | backward_microstep: 390332.21 | backward: 390321.82 | backward_inner_microstep: 390304.68 | backward_inner: 390298.12 | backward_allreduce_microstep: 8.25 | backward_allreduce: 2.97 | reduce_tied_grads: 0.30 | comms: 18.08 | reduce_grads: 0.19 | step: 355.54 | _step_clipping: 0.13 | _step_step: 353.73 | _step_zero_grad: 0.52 | _step_check_overflow: 0.54 samples/sec: 15.871 | iteration 16170/ 143000 | elapsed time per iteration (ms): 64521.6 | learning rate: 5.844E-04 | approx flops per GPU: 68.5TFLOPS | lm_loss: 2.407619E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 17:09:40,223] [INFO] [logging.py:60:log_dist] [Rank 0] step=16180, skipped=14, lr=[0.00058441589826329, 0.00058441589826329], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16180 loss: 2.3889 iter time (s): 62.924 samples/sec: 16.274 %comms: 0.002879677197404806 %optimizer_step 0.05784975606647418 %forward: 23.120463838234873 %backward: 62.06113727657899 [2025-04-05 17:09:40,224] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21365.11 | forward: 145482.96 | backward_microstep: 390523.36 | backward: 390512.85 | backward_inner_microstep: 390495.58 | backward_inner: 390488.58 | backward_allreduce_microstep: 8.11 | backward_allreduce: 2.78 | reduce_tied_grads: 0.31 | comms: 18.12 | reduce_grads: 0.20 | step: 364.01 | _step_clipping: 0.14 | _step_step: 362.24 | _step_zero_grad: 0.49 | _step_check_overflow: 0.55 samples/sec: 16.273 | iteration 16180/ 143000 | elapsed time per iteration (ms): 62924.5 | learning rate: 5.844E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.399281E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 17:20:10,468] [INFO] [logging.py:60:log_dist] [Rank 0] step=16190, skipped=14, lr=[0.0005843949253925253, 0.0005843949253925253], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16190 loss: 2.4021 iter time (s): 63.024 samples/sec: 16.248 %comms: 0.0028884645876624837 %optimizer_step 0.057168779836852965 %forward: 23.08594687702866 %backward: 61.962806246947885 [2025-04-05 17:20:10,468] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22358.55 | forward: 145496.50 | backward_microstep: 390528.17 | backward: 390513.38 | backward_inner_microstep: 390495.35 | backward_inner: 390488.22 | backward_allreduce_microstep: 8.50 | backward_allreduce: 2.94 | reduce_tied_grads: 0.34 | comms: 18.20 | reduce_grads: 0.28 | step: 360.30 | _step_clipping: 0.14 | _step_step: 358.38 | _step_zero_grad: 0.56 | _step_check_overflow: 0.57 samples/sec: 16.248 | iteration 16190/ 143000 | elapsed time per iteration (ms): 63024.4 | learning rate: 5.844E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.401340E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 17:30:43,809] [INFO] [logging.py:60:log_dist] [Rank 0] step=16200, skipped=14, lr=[0.0005843739387955862, 0.0005843739387955862], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16200 loss: 2.3981 iter time (s): 63.334 samples/sec: 16.168 %comms: 0.0028666230247099606 %optimizer_step 0.05748330368925119 %forward: 22.99408969558603 %backward: 61.67382687980456 [2025-04-05 17:30:43,810] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25195.93 | forward: 145629.69 | backward_microstep: 390617.17 | backward: 390602.13 | backward_inner_microstep: 390583.84 | backward_inner: 390576.73 | backward_allreduce_microstep: 8.63 | backward_allreduce: 2.99 | reduce_tied_grads: 0.34 | comms: 18.16 | reduce_grads: 0.21 | step: 364.06 | _step_clipping: 0.14 | _step_step: 362.04 | _step_zero_grad: 0.57 | _step_check_overflow: 0.64 samples/sec: 16.168 | iteration 16200/ 143000 | elapsed time per iteration (ms): 63334.2 | learning rate: 5.844E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.397038E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 17:41:17,108] [INFO] [logging.py:60:log_dist] [Rank 0] step=16210, skipped=14, lr=[0.0005843529384734856, 0.0005843529384734856], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16210 loss: 2.4194 iter time (s): 63.329 samples/sec: 16.169 %comms: 0.0028315425040499093 %optimizer_step 0.05504811224112207 %forward: 22.990536220858147 %backward: 61.63456935115534 [2025-04-05 17:41:17,108] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25513.79 | forward: 145597.28 | backward_microstep: 390336.12 | backward: 390326.93 | backward_inner_microstep: 390310.31 | backward_inner: 390302.12 | backward_allreduce_microstep: 7.98 | backward_allreduce: 2.75 | reduce_tied_grads: 0.29 | comms: 17.93 | reduce_grads: 0.19 | step: 348.62 | _step_clipping: 0.13 | _step_step: 346.71 | _step_zero_grad: 0.46 | _step_check_overflow: 0.74 samples/sec: 16.169 | iteration 16210/ 143000 | elapsed time per iteration (ms): 63329.8 | learning rate: 5.844E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.395997E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 17:51:49,529] [INFO] [logging.py:60:log_dist] [Rank 0] step=16220, skipped=14, lr=[0.000584331924427237, 0.000584331924427237], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16220 loss: 2.4044 iter time (s): 63.242 samples/sec: 16.192 %comms: 0.0029194977378593674 %optimizer_step 0.05665746785046705 %forward: 23.025148947701346 %backward: 61.75751775540177 [2025-04-05 17:51:49,530] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24359.00 | forward: 145614.74 | backward_microstep: 390575.90 | backward: 390564.47 | backward_inner_microstep: 390547.48 | backward_inner: 390540.80 | backward_allreduce_microstep: 8.08 | backward_allreduce: 2.79 | reduce_tied_grads: 0.32 | comms: 18.46 | reduce_grads: 0.20 | step: 358.31 | _step_clipping: 0.14 | _step_step: 356.39 | _step_zero_grad: 0.53 | _step_check_overflow: 0.61 samples/sec: 16.192 | iteration 16220/ 143000 | elapsed time per iteration (ms): 63242.2 | learning rate: 5.843E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.397842E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 18:02:25,897] [INFO] [logging.py:60:log_dist] [Rank 0] step=16230, skipped=14, lr=[0.000584310896657855, 0.000584310896657855], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16230 loss: 2.4177 iter time (s): 63.636 samples/sec: 16.091 %comms: 0.002994009070761349 %optimizer_step 0.058670483808119976 %forward: 22.909792342150322 %backward: 61.40395095975519 [2025-04-05 18:02:25,898] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27888.94 | forward: 145789.27 | backward_microstep: 390768.93 | backward: 390751.57 | backward_inner_microstep: 390732.51 | backward_inner: 390723.54 | backward_allreduce_microstep: 8.12 | backward_allreduce: 2.90 | reduce_tied_grads: 0.38 | comms: 19.05 | reduce_grads: 0.21 | step: 373.36 | _step_clipping: 0.14 | _step_step: 371.25 | _step_zero_grad: 0.55 | _step_check_overflow: 0.75 samples/sec: 16.091 | iteration 16230/ 143000 | elapsed time per iteration (ms): 63636.8 | learning rate: 5.843E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.412425E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 18:13:02,334] [INFO] [logging.py:60:log_dist] [Rank 0] step=16240, skipped=14, lr=[0.0005842898551663541, 0.0005842898551663541], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16240 loss: 2.4029 iter time (s): 63.643 samples/sec: 16.090 %comms: 0.0028612611860297666 %optimizer_step 0.05624713227664482 %forward: 22.89769488065747 %backward: 61.39473969811785 [2025-04-05 18:13:02,335] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28028.05 | forward: 145727.88 | backward_microstep: 390749.47 | backward: 390734.75 | backward_inner_microstep: 390717.63 | backward_inner: 390710.55 | backward_allreduce_microstep: 7.99 | backward_allreduce: 2.76 | reduce_tied_grads: 0.34 | comms: 18.21 | reduce_grads: 0.20 | step: 357.97 | _step_clipping: 0.14 | _step_step: 356.09 | _step_zero_grad: 0.55 | _step_check_overflow: 0.58 samples/sec: 16.090 | iteration 16240/ 143000 | elapsed time per iteration (ms): 63643.6 | learning rate: 5.843E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.395895E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 18:23:41,625] [INFO] [logging.py:60:log_dist] [Rank 0] step=16250, skipped=14, lr=[0.0005842687999537499, 0.0005842687999537499], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16250 loss: 2.3894 iter time (s): 63.929 samples/sec: 16.018 %comms: 0.002822861054990494 %optimizer_step 0.05609538946916003 %forward: 22.790601314866755 %backward: 61.114005113573775 [2025-04-05 18:23:41,626] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30921.07 | forward: 145697.00 | backward_microstep: 390709.83 | backward: 390692.96 | backward_inner_microstep: 390674.92 | backward_inner: 390667.77 | backward_allreduce_microstep: 8.32 | backward_allreduce: 2.85 | reduce_tied_grads: 0.32 | comms: 18.05 | reduce_grads: 0.20 | step: 358.61 | _step_clipping: 0.16 | _step_step: 356.63 | _step_zero_grad: 0.54 | _step_check_overflow: 0.66 samples/sec: 16.018 | iteration 16250/ 143000 | elapsed time per iteration (ms): 63929.2 | learning rate: 5.843E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.399649E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 18:34:12,915] [INFO] [logging.py:60:log_dist] [Rank 0] step=16260, skipped=14, lr=[0.0005842477310210589, 0.0005842477310210589], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16260 loss: 2.3900 iter time (s): 63.128 samples/sec: 16.221 %comms: 0.0028989403756187536 %optimizer_step 0.058138940751811016 %forward: 23.05424719921809 %backward: 61.878653676375286 [2025-04-05 18:34:12,916] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23154.64 | forward: 145537.67 | backward_microstep: 390642.85 | backward: 390629.75 | backward_inner_microstep: 390612.56 | backward_inner: 390605.80 | backward_allreduce_microstep: 8.12 | backward_allreduce: 2.80 | reduce_tied_grads: 0.31 | comms: 18.30 | reduce_grads: 0.21 | step: 367.02 | _step_clipping: 0.12 | _step_step: 365.14 | _step_zero_grad: 0.51 | _step_check_overflow: 0.62 samples/sec: 16.221 | iteration 16260/ 143000 | elapsed time per iteration (ms): 63129.0 | learning rate: 5.842E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.399238E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 18:44:47,169] [INFO] [logging.py:60:log_dist] [Rank 0] step=16270, skipped=14, lr=[0.0005842266483692977, 0.0005842266483692977], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16270 loss: 2.3723 iter time (s): 63.425 samples/sec: 16.145 %comms: 0.0029264381545573803 %optimizer_step 0.05674820309028802 %forward: 22.94869906024474 %backward: 61.58432535790109 [2025-04-05 18:44:47,170] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26164.38 | forward: 145551.75 | backward_microstep: 390609.65 | backward: 390597.58 | backward_inner_microstep: 390579.93 | backward_inner: 390572.89 | backward_allreduce_microstep: 8.43 | backward_allreduce: 2.88 | reduce_tied_grads: 0.33 | comms: 18.56 | reduce_grads: 0.21 | step: 359.92 | _step_clipping: 0.14 | _step_step: 357.98 | _step_zero_grad: 0.58 | _step_check_overflow: 0.57 samples/sec: 16.145 | iteration 16270/ 143000 | elapsed time per iteration (ms): 63425.4 | learning rate: 5.842E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.400665E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 18:55:29,523] [INFO] [logging.py:60:log_dist] [Rank 0] step=16280, skipped=14, lr=[0.000584205551999484, 0.000584205551999484], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16280 loss: 2.3982 iter time (s): 64.235 samples/sec: 15.942 %comms: 0.0028424753789815333 %optimizer_step 0.05632285793412169 %forward: 22.710868665007773 %backward: 60.82514199292311 [2025-04-05 18:55:29,524] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 33789.01 | forward: 145882.71 | backward_microstep: 390723.48 | backward: 390708.82 | backward_inner_microstep: 390691.13 | backward_inner: 390684.06 | backward_allreduce_microstep: 8.25 | backward_allreduce: 2.84 | reduce_tied_grads: 0.38 | comms: 18.26 | reduce_grads: 0.21 | step: 361.79 | _step_clipping: 0.14 | _step_step: 359.91 | _step_zero_grad: 0.52 | _step_check_overflow: 0.59 samples/sec: 15.941 | iteration 16280/ 143000 | elapsed time per iteration (ms): 64235.4 | learning rate: 5.842E-04 | approx flops per GPU: 68.8TFLOPS | lm_loss: 2.399658E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 19:06:06,423] [INFO] [logging.py:60:log_dist] [Rank 0] step=16290, skipped=14, lr=[0.0005841844419126358, 0.0005841844419126358], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16290 loss: 2.3875 iter time (s): 63.689 samples/sec: 16.078 %comms: 0.0028955296095087586 %optimizer_step 0.05613548044272794 %forward: 22.886474619201312 %backward: 61.32996843561269 [2025-04-05 19:06:06,423] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28569.30 | forward: 145762.46 | backward_microstep: 390620.68 | backward: 390606.56 | backward_inner_microstep: 390589.17 | backward_inner: 390582.17 | backward_allreduce_microstep: 8.17 | backward_allreduce: 2.82 | reduce_tied_grads: 0.33 | comms: 18.44 | reduce_grads: 0.23 | step: 357.52 | _step_clipping: 0.15 | _step_step: 355.42 | _step_zero_grad: 0.55 | _step_check_overflow: 0.58 samples/sec: 16.078 | iteration 16290/ 143000 | elapsed time per iteration (ms): 63690.0 | learning rate: 5.842E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.397153E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 19:16:40,457] [INFO] [logging.py:60:log_dist] [Rank 0] step=16300, skipped=14, lr=[0.0005841633181097722, 0.0005841633181097722], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16300 loss: 2.3883 iter time (s): 63.403 samples/sec: 16.151 %comms: 0.0029394876239175988 %optimizer_step 0.056516603868069025 %forward: 22.974994994889634 %backward: 61.57726208665878 [2025-04-05 19:16:40,457] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26008.16 | forward: 145667.95 | backward_microstep: 390430.40 | backward: 390417.20 | backward_inner_microstep: 390398.21 | backward_inner: 390391.43 | backward_allreduce_microstep: 9.83 | backward_allreduce: 4.56 | reduce_tied_grads: 0.34 | comms: 18.64 | reduce_grads: 0.40 | step: 358.33 | _step_clipping: 0.14 | _step_step: 356.35 | _step_zero_grad: 0.53 | _step_check_overflow: 0.64 samples/sec: 16.151 | iteration 16300/ 143000 | elapsed time per iteration (ms): 63403.4 | learning rate: 5.842E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.388893E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 19:27:34,979] [INFO] [logging.py:60:log_dist] [Rank 0] step=16310, skipped=14, lr=[0.0005841421805919127, 0.0005841421805919127], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16310 loss: 2.4109 iter time (s): 65.452 samples/sec: 15.645 %comms: 0.0028098802135985857 %optimizer_step 0.0575738037387683 %forward: 22.273510398209286 %backward: 59.67528054136668 [2025-04-05 19:27:34,980] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 46182.80 | forward: 145783.82 | backward_microstep: 390598.44 | backward: 390584.62 | backward_inner_microstep: 390566.12 | backward_inner: 390559.14 | backward_allreduce_microstep: 8.86 | backward_allreduce: 3.15 | reduce_tied_grads: 0.34 | comms: 18.39 | reduce_grads: 0.21 | step: 376.83 | _step_clipping: 0.16 | _step_step: 374.87 | _step_zero_grad: 0.56 | _step_check_overflow: 0.57 samples/sec: 15.645 | iteration 16310/ 143000 | elapsed time per iteration (ms): 65452.3 | learning rate: 5.841E-04 | approx flops per GPU: 67.5TFLOPS | lm_loss: 2.405725E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 19:38:07,530] [INFO] [logging.py:60:log_dist] [Rank 0] step=16320, skipped=14, lr=[0.0005841210293600774, 0.0005841210293600774], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16320 loss: 2.3984 iter time (s): 63.255 samples/sec: 16.189 %comms: 0.002865869081136755 %optimizer_step 0.055728666287886024 %forward: 23.02234398005883 %backward: 61.74756529346666 [2025-04-05 19:38:07,531] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24385.41 | forward: 145626.74 | backward_microstep: 390593.64 | backward: 390581.28 | backward_inner_microstep: 390564.33 | backward_inner: 390557.32 | backward_allreduce_microstep: 7.97 | backward_allreduce: 2.74 | reduce_tied_grads: 0.31 | comms: 18.13 | reduce_grads: 0.20 | step: 352.51 | _step_clipping: 0.13 | _step_step: 350.47 | _step_zero_grad: 0.49 | _step_check_overflow: 0.81 samples/sec: 16.188 | iteration 16320/ 143000 | elapsed time per iteration (ms): 63255.1 | learning rate: 5.841E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.402488E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 19:48:44,590] [INFO] [logging.py:60:log_dist] [Rank 0] step=16330, skipped=14, lr=[0.000584099864415287, 0.000584099864415287], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16330 loss: 2.3954 iter time (s): 63.705 samples/sec: 16.074 %comms: 0.0029106732624307476 %optimizer_step 0.05589115419876022 %forward: 22.863853298007683 %backward: 61.28053814376682 [2025-04-05 19:48:44,590] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29016.13 | forward: 145654.84 | backward_microstep: 390401.92 | backward: 390389.44 | backward_inner_microstep: 390372.19 | backward_inner: 390365.27 | backward_allreduce_microstep: 8.13 | backward_allreduce: 2.80 | reduce_tied_grads: 0.34 | comms: 18.54 | reduce_grads: 0.21 | step: 356.06 | _step_clipping: 0.13 | _step_step: 354.14 | _step_zero_grad: 0.53 | _step_check_overflow: 0.62 samples/sec: 16.074 | iteration 16330/ 143000 | elapsed time per iteration (ms): 63705.9 | learning rate: 5.841E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.402769E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 19:59:15,395] [INFO] [logging.py:60:log_dist] [Rank 0] step=16340, skipped=14, lr=[0.0005840786857585635, 0.0005840786857585635], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16340 loss: 2.3956 iter time (s): 63.080 samples/sec: 16.233 %comms: 0.002878521559819956 %optimizer_step 0.0563930414576952 %forward: 23.060047912290056 %backward: 61.88260971884488 [2025-04-05 19:59:15,396] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23056.87 | forward: 145462.83 | backward_microstep: 390366.73 | backward: 390355.64 | backward_inner_microstep: 390337.39 | backward_inner: 390330.85 | backward_allreduce_microstep: 7.84 | backward_allreduce: 2.69 | reduce_tied_grads: 0.31 | comms: 18.16 | reduce_grads: 0.19 | step: 355.73 | _step_clipping: 0.13 | _step_step: 353.99 | _step_zero_grad: 0.48 | _step_check_overflow: 0.53 samples/sec: 16.233 | iteration 16340/ 143000 | elapsed time per iteration (ms): 63080.7 | learning rate: 5.841E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.398730E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 20:09:51,170] [INFO] [logging.py:60:log_dist] [Rank 0] step=16350, skipped=14, lr=[0.0005840574933909286, 0.0005840574933909286], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16350 loss: 2.3887 iter time (s): 63.577 samples/sec: 16.107 %comms: 0.00285392924412975 %optimizer_step 0.05571632802383441 %forward: 22.91002986614097 %backward: 61.38807763012451 [2025-04-05 20:09:51,171] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27869.30 | forward: 145654.64 | backward_microstep: 390295.93 | backward: 390285.76 | backward_inner_microstep: 390268.56 | backward_inner: 390261.92 | backward_allreduce_microstep: 8.28 | backward_allreduce: 2.82 | reduce_tied_grads: 0.30 | comms: 18.14 | reduce_grads: 0.19 | step: 354.23 | _step_clipping: 0.13 | _step_step: 352.39 | _step_zero_grad: 0.52 | _step_check_overflow: 0.57 samples/sec: 16.106 | iteration 16350/ 143000 | elapsed time per iteration (ms): 63577.4 | learning rate: 5.841E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.392964E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 20:20:23,090] [INFO] [logging.py:60:log_dist] [Rank 0] step=16360, skipped=14, lr=[0.0005840362873134054, 0.0005840362873134054], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16360 loss: 2.3967 iter time (s): 63.191 samples/sec: 16.205 %comms: 0.002872878211459486 %optimizer_step 0.05613861491583343 %forward: 23.022000232633562 %backward: 61.761103380279835 [2025-04-05 20:20:23,091] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24214.53 | forward: 145479.44 | backward_microstep: 390288.18 | backward: 390277.59 | backward_inner_microstep: 390260.23 | backward_inner: 390253.54 | backward_allreduce_microstep: 8.21 | backward_allreduce: 2.83 | reduce_tied_grads: 0.32 | comms: 18.15 | reduce_grads: 0.20 | step: 354.75 | _step_clipping: 0.13 | _step_step: 352.86 | _step_zero_grad: 0.52 | _step_check_overflow: 0.64 samples/sec: 16.205 | iteration 16360/ 143000 | elapsed time per iteration (ms): 63192.1 | learning rate: 5.840E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.394225E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 20:31:11,013] [INFO] [logging.py:60:log_dist] [Rank 0] step=16370, skipped=14, lr=[0.0005840150675270172, 0.0005840150675270172], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16370 loss: 2.3989 iter time (s): 64.792 samples/sec: 15.805 %comms: 0.0028166109808303776 %optimizer_step 0.05733414903910275 %forward: 22.527493000573788 %backward: 60.27146425466374 [2025-04-05 20:31:11,014] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 39433.57 | forward: 145959.23 | backward_microstep: 390521.04 | backward: 390508.46 | backward_inner_microstep: 390490.58 | backward_inner: 390483.49 | backward_allreduce_microstep: 8.56 | backward_allreduce: 2.85 | reduce_tied_grads: 0.34 | comms: 18.25 | reduce_grads: 0.21 | step: 371.48 | _step_clipping: 0.14 | _step_step: 369.56 | _step_zero_grad: 0.58 | _step_check_overflow: 0.53 samples/sec: 15.804 | iteration 16370/ 143000 | elapsed time per iteration (ms): 64792.2 | learning rate: 5.840E-04 | approx flops per GPU: 68.2TFLOPS | lm_loss: 2.411979E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 20:41:47,621] [INFO] [logging.py:60:log_dist] [Rank 0] step=16380, skipped=14, lr=[0.0005839938340327884, 0.0005839938340327884], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16380 loss: 2.3859 iter time (s): 63.660 samples/sec: 16.085 %comms: 0.00283116291547967 %optimizer_step 0.05507987285093982 %forward: 22.86122598938885 %backward: 61.31016647817321 [2025-04-05 20:41:47,622] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28774.21 | forward: 145535.12 | backward_microstep: 390312.66 | backward: 390302.02 | backward_inner_microstep: 390285.31 | backward_inner: 390278.66 | backward_allreduce_microstep: 7.81 | backward_allreduce: 2.70 | reduce_tied_grads: 0.29 | comms: 18.02 | reduce_grads: 0.19 | step: 350.64 | _step_clipping: 0.13 | _step_step: 348.81 | _step_zero_grad: 0.50 | _step_check_overflow: 0.63 samples/sec: 16.085 | iteration 16380/ 143000 | elapsed time per iteration (ms): 63660.8 | learning rate: 5.840E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.397319E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 20:52:25,075] [INFO] [logging.py:60:log_dist] [Rank 0] step=16390, skipped=14, lr=[0.0005839725868317436, 0.0005839725868317436], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16390 loss: 2.4003 iter time (s): 63.745 samples/sec: 16.064 %comms: 0.0029047202256844112 %optimizer_step 0.05705481373991343 %forward: 22.85309908119292 %backward: 61.26214726427439 [2025-04-05 20:52:25,075] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29203.40 | forward: 145676.49 | backward_microstep: 390529.40 | backward: 390513.97 | backward_inner_microstep: 390495.46 | backward_inner: 390488.19 | backward_allreduce_microstep: 8.77 | backward_allreduce: 3.00 | reduce_tied_grads: 0.37 | comms: 18.52 | reduce_grads: 0.21 | step: 363.69 | _step_clipping: 0.12 | _step_step: 361.77 | _step_zero_grad: 0.55 | _step_check_overflow: 0.62 samples/sec: 16.064 | iteration 16390/ 143000 | elapsed time per iteration (ms): 63745.4 | learning rate: 5.840E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.392222E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 21:02:59,411] [INFO] [logging.py:60:log_dist] [Rank 0] step=16400, skipped=14, lr=[0.0005839513259249085, 0.0005839513259249085], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16400 loss: 2.4222 iter time (s): 63.433 samples/sec: 16.143 %comms: 0.0028924978504381383 %optimizer_step 0.06046751402502136 %forward: 22.959569672574123 %backward: 61.56178880600574 [2025-04-05 21:02:59,411] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26088.15 | forward: 145639.41 | backward_microstep: 390518.20 | backward: 390504.83 | backward_inner_microstep: 390484.70 | backward_inner: 390477.76 | backward_allreduce_microstep: 8.88 | backward_allreduce: 2.98 | reduce_tied_grads: 0.37 | comms: 18.35 | reduce_grads: 0.23 | step: 383.56 | _step_clipping: 0.16 | _step_step: 381.64 | _step_zero_grad: 0.53 | _step_check_overflow: 0.57 samples/sec: 16.143 | iteration 16400/ 143000 | elapsed time per iteration (ms): 63433.6 | learning rate: 5.840E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.404197E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 21:13:31,793] [INFO] [logging.py:60:log_dist] [Rank 0] step=16410, skipped=14, lr=[0.0005839300513133089, 0.0005839300513133089], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16410 loss: 2.4016 iter time (s): 63.238 samples/sec: 16.193 %comms: 0.0028991344655467854 %optimizer_step 0.0569472410566471 %forward: 23.01464462244888 %backward: 61.70825316759282 [2025-04-05 21:13:31,793] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24572.43 | forward: 145539.12 | backward_microstep: 390238.79 | backward: 390228.28 | backward_inner_microstep: 390210.77 | backward_inner: 390204.02 | backward_allreduce_microstep: 8.39 | backward_allreduce: 2.88 | reduce_tied_grads: 0.32 | comms: 18.33 | reduce_grads: 0.21 | step: 360.12 | _step_clipping: 0.13 | _step_step: 358.31 | _step_zero_grad: 0.51 | _step_check_overflow: 0.56 samples/sec: 16.193 | iteration 16410/ 143000 | elapsed time per iteration (ms): 63238.2 | learning rate: 5.839E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.401975E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 21:24:05,492] [INFO] [logging.py:60:log_dist] [Rank 0] step=16420, skipped=14, lr=[0.0005839087629979722, 0.0005839087629979722], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16420 loss: 2.3907 iter time (s): 63.369 samples/sec: 16.159 %comms: 0.002896384478016618 %optimizer_step 0.05777501262890866 %forward: 22.97806215910927 %backward: 61.60418517191747 [2025-04-05 21:24:05,492] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25632.69 | forward: 145610.31 | backward_microstep: 390394.88 | backward: 390381.23 | backward_inner_microstep: 390363.41 | backward_inner: 390356.49 | backward_allreduce_microstep: 8.45 | backward_allreduce: 2.87 | reduce_tied_grads: 0.33 | comms: 18.35 | reduce_grads: 0.22 | step: 366.12 | _step_clipping: 0.13 | _step_step: 364.15 | _step_zero_grad: 0.56 | _step_check_overflow: 0.62 samples/sec: 16.159 | iteration 16420/ 143000 | elapsed time per iteration (ms): 63369.9 | learning rate: 5.839E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.398762E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 21:34:42,929] [INFO] [logging.py:60:log_dist] [Rank 0] step=16430, skipped=14, lr=[0.0005838874609799252, 0.0005838874609799252], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16430 loss: 2.4245 iter time (s): 63.743 samples/sec: 16.064 %comms: 0.002863165672806302 %optimizer_step 0.05728410954900877 %forward: 22.84858159728591 %backward: 61.26145028479265 [2025-04-05 21:34:42,930] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29212.54 | forward: 145643.93 | backward_microstep: 390514.07 | backward: 390499.44 | backward_inner_microstep: 390481.15 | backward_inner: 390474.07 | backward_allreduce_microstep: 8.65 | backward_allreduce: 3.00 | reduce_tied_grads: 0.35 | comms: 18.25 | reduce_grads: 0.21 | step: 365.15 | _step_clipping: 0.14 | _step_step: 363.08 | _step_zero_grad: 0.68 | _step_check_overflow: 0.60 samples/sec: 16.064 | iteration 16430/ 143000 | elapsed time per iteration (ms): 63743.7 | learning rate: 5.839E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.406979E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 21:45:24,779] [INFO] [logging.py:60:log_dist] [Rank 0] step=16440, skipped=14, lr=[0.0005838661452601965, 0.0005838661452601965], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16440 loss: 2.3893 iter time (s): 64.184 samples/sec: 15.954 %comms: 0.0028761316888489376 %optimizer_step 0.05650439074016097 %forward: 22.679415937710807 %backward: 60.81706744655515 [2025-04-05 21:45:24,780] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 33873.72 | forward: 145566.43 | backward_microstep: 390363.14 | backward: 390350.59 | backward_inner_microstep: 390332.99 | backward_inner: 390326.25 | backward_allreduce_microstep: 8.39 | backward_allreduce: 2.84 | reduce_tied_grads: 0.36 | comms: 18.46 | reduce_grads: 0.22 | step: 362.67 | _step_clipping: 0.15 | _step_step: 360.74 | _step_zero_grad: 0.56 | _step_check_overflow: 0.59 samples/sec: 15.954 | iteration 16440/ 143000 | elapsed time per iteration (ms): 64185.0 | learning rate: 5.839E-04 | approx flops per GPU: 68.8TFLOPS | lm_loss: 2.407954E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 21:56:00,446] [INFO] [logging.py:60:log_dist] [Rank 0] step=16450, skipped=14, lr=[0.0005838448158398147, 0.0005838448158398147], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16450 loss: 2.4132 iter time (s): 63.566 samples/sec: 16.109 %comms: 0.0028979171642431532 %optimizer_step 0.05753301187600608 %forward: 22.944524229882123 %backward: 61.40730061874573 [2025-04-05 21:56:00,447] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27356.74 | forward: 145849.44 | backward_microstep: 390355.06 | backward: 390342.38 | backward_inner_microstep: 390323.63 | backward_inner: 390316.74 | backward_allreduce_microstep: 8.83 | backward_allreduce: 3.03 | reduce_tied_grads: 0.36 | comms: 18.42 | reduce_grads: 0.22 | step: 365.72 | _step_clipping: 0.16 | _step_step: 363.69 | _step_zero_grad: 0.57 | _step_check_overflow: 0.65 samples/sec: 16.109 | iteration 16450/ 143000 | elapsed time per iteration (ms): 63566.8 | learning rate: 5.838E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.396150E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 22:06:34,412] [INFO] [logging.py:60:log_dist] [Rank 0] step=16460, skipped=14, lr=[0.0005838234727198093, 0.0005838234727198093], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16460 loss: 2.4099 iter time (s): 63.396 samples/sec: 16.152 %comms: 0.0029123533224689317 %optimizer_step 0.05941208299410284 %forward: 22.98206851723975 %backward: 61.57992541962574 [2025-04-05 22:06:34,413] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25754.13 | forward: 145696.96 | backward_microstep: 390404.67 | backward: 390391.66 | backward_inner_microstep: 390371.93 | backward_inner: 390364.98 | backward_allreduce_microstep: 8.59 | backward_allreduce: 3.08 | reduce_tied_grads: 0.33 | comms: 18.46 | reduce_grads: 0.21 | step: 376.65 | _step_clipping: 0.18 | _step_step: 374.63 | _step_zero_grad: 0.57 | _step_check_overflow: 0.62 samples/sec: 16.152 | iteration 16460/ 143000 | elapsed time per iteration (ms): 63396.6 | learning rate: 5.838E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.399541E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 22:17:11,299] [INFO] [logging.py:60:log_dist] [Rank 0] step=16470, skipped=14, lr=[0.0005838021159012104, 0.0005838021159012104], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16470 loss: 2.3849 iter time (s): 63.688 samples/sec: 16.078 %comms: 0.0028506312763541468 %optimizer_step 0.05742171964101892 %forward: 22.845728009371456 %backward: 61.285427942402734 [2025-04-05 22:17:11,300] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28982.93 | forward: 145499.85 | backward_microstep: 390326.31 | backward: 390314.58 | backward_inner_microstep: 390296.89 | backward_inner: 390290.09 | backward_allreduce_microstep: 8.39 | backward_allreduce: 2.89 | reduce_tied_grads: 0.34 | comms: 18.16 | reduce_grads: 0.23 | step: 365.71 | _step_clipping: 0.14 | _step_step: 363.81 | _step_zero_grad: 0.54 | _step_check_overflow: 0.59 samples/sec: 16.078 | iteration 16470/ 143000 | elapsed time per iteration (ms): 63688.7 | learning rate: 5.838E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.399267E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 22:27:46,767] [INFO] [logging.py:60:log_dist] [Rank 0] step=16480, skipped=14, lr=[0.0005837807453850487, 0.0005837807453850487], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16480 loss: 2.4145 iter time (s): 63.546 samples/sec: 16.114 %comms: 0.002874441044832415 %optimizer_step 0.056857188993580235 %forward: 22.9261059156791 %backward: 61.43518303960368 [2025-04-05 22:27:46,767] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27259.47 | forward: 145686.55 | backward_microstep: 390408.45 | backward: 390396.86 | backward_inner_microstep: 390378.66 | backward_inner: 390371.81 | backward_allreduce_microstep: 8.60 | backward_allreduce: 2.95 | reduce_tied_grads: 0.35 | comms: 18.27 | reduce_grads: 0.21 | step: 361.31 | _step_clipping: 0.14 | _step_step: 359.57 | _step_zero_grad: 0.51 | _step_check_overflow: 0.48 samples/sec: 16.114 | iteration 16480/ 143000 | elapsed time per iteration (ms): 63546.7 | learning rate: 5.838E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.402445E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 22:38:26,033] [INFO] [logging.py:60:log_dist] [Rank 0] step=16490, skipped=14, lr=[0.0005837593611723559, 0.0005837593611723559], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16490 loss: 2.3936 iter time (s): 63.926 samples/sec: 16.018 %comms: 0.0028371784404372267 %optimizer_step 0.05633514072612243 %forward: 22.786078720412235 %backward: 61.069772169167045 [2025-04-05 22:38:26,034] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31087.08 | forward: 145662.54 | backward_microstep: 390408.52 | backward: 390395.30 | backward_inner_microstep: 390377.62 | backward_inner: 390370.51 | backward_allreduce_microstep: 8.27 | backward_allreduce: 2.84 | reduce_tied_grads: 0.33 | comms: 18.14 | reduce_grads: 0.21 | step: 360.13 | _step_clipping: 0.14 | _step_step: 358.21 | _step_zero_grad: 0.50 | _step_check_overflow: 0.68 samples/sec: 16.018 | iteration 16490/ 143000 | elapsed time per iteration (ms): 63926.7 | learning rate: 5.838E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.403926E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 22:49:01,449] [INFO] [logging.py:60:log_dist] [Rank 0] step=16500, skipped=14, lr=[0.0005837379632641637, 0.0005837379632641637], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16500 loss: 2.3944 iter time (s): 63.541 samples/sec: 16.116 %comms: 0.002869274333827782 %optimizer_step 0.058339785204485156 %forward: 22.90016328080457 %backward: 61.43132599529058 [2025-04-05 22:49:01,450] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27420.50 | forward: 145509.72 | backward_microstep: 390357.71 | backward: 390340.24 | backward_inner_microstep: 390320.28 | backward_inner: 390313.32 | backward_allreduce_microstep: 8.65 | backward_allreduce: 2.95 | reduce_tied_grads: 0.35 | comms: 18.23 | reduce_grads: 0.24 | step: 370.70 | _step_clipping: 0.14 | _step_step: 368.67 | _step_zero_grad: 0.60 | _step_check_overflow: 0.61 samples/sec: 16.115 | iteration 16500/ 143000 | elapsed time per iteration (ms): 63541.6 | learning rate: 5.837E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.399945E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 22:59:39,713] [INFO] [logging.py:60:log_dist] [Rank 0] step=16510, skipped=14, lr=[0.0005837165516615053, 0.0005837165516615053], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16510 loss: 2.3722 iter time (s): 63.826 samples/sec: 16.044 %comms: 0.0028590819235986705 %optimizer_step 0.05730266729953139 %forward: 22.806976966304177 %backward: 61.16089266065386 [2025-04-05 22:59:39,714] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30198.37 | forward: 145567.36 | backward_microstep: 390375.59 | backward: 390364.30 | backward_inner_microstep: 390343.59 | backward_inner: 390336.72 | backward_allreduce_microstep: 8.54 | backward_allreduce: 2.99 | reduce_tied_grads: 0.34 | comms: 18.25 | reduce_grads: 0.22 | step: 365.74 | _step_clipping: 0.15 | _step_step: 363.73 | _step_zero_grad: 0.59 | _step_check_overflow: 0.63 samples/sec: 16.044 | iteration 16510/ 143000 | elapsed time per iteration (ms): 63826.5 | learning rate: 5.837E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.393450E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 23:10:17,004] [INFO] [logging.py:60:log_dist] [Rank 0] step=16520, skipped=14, lr=[0.0005836951263654138, 0.0005836951263654138], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16520 loss: 2.3654 iter time (s): 63.728 samples/sec: 16.068 %comms: 0.002877142199738031 %optimizer_step 0.05713625428763165 %forward: 22.84468823026552 %backward: 61.28201907552124 [2025-04-05 23:10:17,005] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28992.48 | forward: 145585.65 | backward_microstep: 390554.62 | backward: 390540.80 | backward_inner_microstep: 390523.00 | backward_inner: 390516.06 | backward_allreduce_microstep: 8.44 | backward_allreduce: 2.87 | reduce_tied_grads: 0.33 | comms: 18.34 | reduce_grads: 0.20 | step: 364.12 | _step_clipping: 0.13 | _step_step: 362.26 | _step_zero_grad: 0.51 | _step_check_overflow: 0.61 samples/sec: 16.068 | iteration 16520/ 143000 | elapsed time per iteration (ms): 63729.1 | learning rate: 5.837E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.390148E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 23:20:47,512] [INFO] [logging.py:60:log_dist] [Rank 0] step=16530, skipped=14, lr=[0.0005836736873769232, 0.0005836736873769232], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16530 loss: 2.3960 iter time (s): 63.050 samples/sec: 16.241 %comms: 0.002887939934736649 %optimizer_step 0.057255719648855743 %forward: 23.110284348244164 %backward: 61.94969789231679 [2025-04-05 23:20:47,512] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21987.06 | forward: 145710.68 | backward_microstep: 390606.76 | backward: 390593.76 | backward_inner_microstep: 390575.48 | backward_inner: 390568.59 | backward_allreduce_microstep: 8.70 | backward_allreduce: 2.88 | reduce_tied_grads: 0.33 | comms: 18.21 | reduce_grads: 0.20 | step: 361.00 | _step_clipping: 0.14 | _step_step: 359.14 | _step_zero_grad: 0.49 | _step_check_overflow: 0.61 samples/sec: 16.241 | iteration 16530/ 143000 | elapsed time per iteration (ms): 63050.7 | learning rate: 5.837E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.389486E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 23:31:34,328] [INFO] [logging.py:60:log_dist] [Rank 0] step=16540, skipped=14, lr=[0.0005836522346970686, 0.0005836522346970686], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16540 loss: 2.3791 iter time (s): 64.681 samples/sec: 15.832 %comms: 0.00281228208361688 %optimizer_step 0.05584640414369674 %forward: 22.546084115837413 %backward: 60.38412724150548 [2025-04-05 23:31:34,329] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 38247.93 | forward: 145830.52 | backward_microstep: 390584.07 | backward: 390571.08 | backward_inner_microstep: 390552.44 | backward_inner: 390545.32 | backward_allreduce_microstep: 8.90 | backward_allreduce: 3.07 | reduce_tied_grads: 0.34 | comms: 18.19 | reduce_grads: 0.23 | step: 361.22 | _step_clipping: 0.13 | _step_step: 359.30 | _step_zero_grad: 0.58 | _step_check_overflow: 0.59 samples/sec: 15.831 | iteration 16540/ 143000 | elapsed time per iteration (ms): 64681.7 | learning rate: 5.837E-04 | approx flops per GPU: 68.3TFLOPS | lm_loss: 2.393431E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 23:42:14,721] [INFO] [logging.py:60:log_dist] [Rank 0] step=16550, skipped=14, lr=[0.0005836307683268851, 0.0005836307683268851], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16550 loss: 2.3796 iter time (s): 64.039 samples/sec: 15.990 %comms: 0.0028049371423899206 %optimizer_step 0.055146538542598454 %forward: 23.309658745502087 %backward: 60.94788988447015 [2025-04-05 23:42:14,722] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28693.70 | forward: 149272.05 | backward_microstep: 390311.72 | backward: 390302.43 | backward_inner_microstep: 390285.39 | backward_inner: 390278.93 | backward_allreduce_microstep: 8.20 | backward_allreduce: 2.82 | reduce_tied_grads: 0.31 | comms: 17.96 | reduce_grads: 0.19 | step: 353.15 | _step_clipping: 0.13 | _step_step: 351.43 | _step_zero_grad: 0.47 | _step_check_overflow: 0.56 samples/sec: 15.990 | iteration 16550/ 143000 | elapsed time per iteration (ms): 64039.3 | learning rate: 5.836E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.387651E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-05 23:52:42,880] [INFO] [logging.py:60:log_dist] [Rank 0] step=16560, skipped=14, lr=[0.000583609288267409, 0.000583609288267409], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16560 loss: 2.3903 iter time (s): 62.815 samples/sec: 16.302 %comms: 0.0029118325277381635 %optimizer_step 0.058774518025474815 %forward: 23.16538999432939 %backward: 62.161305710538485 [2025-04-05 23:52:42,881] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19960.47 | forward: 145514.05 | backward_microstep: 390483.98 | backward: 390467.99 | backward_inner_microstep: 390450.40 | backward_inner: 390443.32 | backward_allreduce_microstep: 8.20 | backward_allreduce: 2.83 | reduce_tied_grads: 0.35 | comms: 18.29 | reduce_grads: 0.23 | step: 369.19 | _step_clipping: 0.14 | _step_step: 367.26 | _step_zero_grad: 0.56 | _step_check_overflow: 0.61 samples/sec: 16.302 | iteration 16560/ 143000 | elapsed time per iteration (ms): 62815.9 | learning rate: 5.836E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.396355E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 00:03:21,669] [INFO] [logging.py:60:log_dist] [Rank 0] step=16570, skipped=14, lr=[0.0005835877945196768, 0.0005835877945196768], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16570 loss: 2.4205 iter time (s): 63.878 samples/sec: 16.030 %comms: 0.0028546819243391443 %optimizer_step 0.05730119674970389 %forward: 22.797060137597796 %backward: 61.11107840914539 [2025-04-06 00:03:21,670] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30606.95 | forward: 145623.62 | backward_microstep: 390378.35 | backward: 390366.83 | backward_inner_microstep: 390348.14 | backward_inner: 390340.80 | backward_allreduce_microstep: 8.91 | backward_allreduce: 3.06 | reduce_tied_grads: 0.36 | comms: 18.24 | reduce_grads: 0.24 | step: 366.03 | _step_clipping: 0.15 | _step_step: 363.93 | _step_zero_grad: 0.59 | _step_check_overflow: 0.69 samples/sec: 16.030 | iteration 16570/ 143000 | elapsed time per iteration (ms): 63878.9 | learning rate: 5.836E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.395998E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 00:13:57,014] [INFO] [logging.py:60:log_dist] [Rank 0] step=16580, skipped=14, lr=[0.0005835662870847261, 0.0005835662870847261], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16580 loss: 2.3952 iter time (s): 63.534 samples/sec: 16.117 %comms: 0.0028359683473194574 %optimizer_step 0.05560405042791454 %forward: 22.907189515882727 %backward: 61.47108210363017 [2025-04-06 00:13:57,015] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27047.64 | forward: 145538.26 | backward_microstep: 390563.24 | backward: 390549.64 | backward_inner_microstep: 390531.57 | backward_inner: 390524.71 | backward_allreduce_microstep: 8.70 | backward_allreduce: 2.99 | reduce_tied_grads: 0.31 | comms: 18.02 | reduce_grads: 0.20 | step: 353.27 | _step_clipping: 0.12 | _step_step: 351.47 | _step_zero_grad: 0.50 | _step_check_overflow: 0.60 samples/sec: 16.117 | iteration 16580/ 143000 | elapsed time per iteration (ms): 63534.5 | learning rate: 5.836E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.395431E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 00:24:32,598] [INFO] [logging.py:60:log_dist] [Rank 0] step=16590, skipped=14, lr=[0.0005835447659635947, 0.0005835447659635947], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16590 loss: 2.3866 iter time (s): 63.558 samples/sec: 16.111 %comms: 0.0028382398370746047 %optimizer_step 0.057092689929827974 %forward: 22.928157542343914 %backward: 61.46137904473128 [2025-04-06 00:24:32,599] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26957.23 | forward: 145726.31 | backward_microstep: 390646.48 | backward: 390634.97 | backward_inner_microstep: 390617.17 | backward_inner: 390610.44 | backward_allreduce_microstep: 8.57 | backward_allreduce: 2.93 | reduce_tied_grads: 0.31 | comms: 18.04 | reduce_grads: 0.22 | step: 362.87 | _step_clipping: 0.14 | _step_step: 361.09 | _step_zero_grad: 0.52 | _step_check_overflow: 0.52 samples/sec: 16.111 | iteration 16590/ 143000 | elapsed time per iteration (ms): 63558.4 | learning rate: 5.835E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.388331E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 00:35:04,190] [INFO] [logging.py:60:log_dist] [Rank 0] step=16600, skipped=14, lr=[0.0005835232311573217, 0.0005835232311573217], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16600 loss: 2.3864 iter time (s): 63.159 samples/sec: 16.213 %comms: 0.0028893997164277387 %optimizer_step 0.058103690937324926 %forward: 23.062391764037717 %backward: 61.856991913330006 [2025-04-06 00:35:04,190] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22970.46 | forward: 145658.76 | backward_microstep: 390695.98 | backward: 390679.90 | backward_inner_microstep: 390660.03 | backward_inner: 390652.97 | backward_allreduce_microstep: 10.35 | backward_allreduce: 2.91 | reduce_tied_grads: 0.33 | comms: 18.25 | reduce_grads: 0.22 | step: 366.97 | _step_clipping: 0.13 | _step_step: 365.05 | _step_zero_grad: 0.54 | _step_check_overflow: 0.61 samples/sec: 16.213 | iteration 16600/ 143000 | elapsed time per iteration (ms): 63159.2 | learning rate: 5.835E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.385826E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 00:45:40,472] [INFO] [logging.py:60:log_dist] [Rank 0] step=16610, skipped=14, lr=[0.0005835016826669459, 0.0005835016826669459], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16610 loss: 2.3946 iter time (s): 63.628 samples/sec: 16.094 %comms: 0.002833776756482027 %optimizer_step 0.056347964235964214 %forward: 22.95717953183965 %backward: 61.40860667260232 [2025-04-06 00:45:40,472] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27205.38 | forward: 146071.04 | backward_microstep: 390743.52 | backward: 390728.27 | backward_inner_microstep: 390710.52 | backward_inner: 390703.54 | backward_allreduce_microstep: 8.34 | backward_allreduce: 2.86 | reduce_tied_grads: 0.33 | comms: 18.03 | reduce_grads: 0.20 | step: 358.53 | _step_clipping: 0.13 | _step_step: 356.82 | _step_zero_grad: 0.48 | _step_check_overflow: 0.52 samples/sec: 16.094 | iteration 16610/ 143000 | elapsed time per iteration (ms): 63628.2 | learning rate: 5.835E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.396494E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 00:56:16,018] [INFO] [logging.py:60:log_dist] [Rank 0] step=16620, skipped=14, lr=[0.0005834801204935078, 0.0005834801204935078], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16620 loss: 2.3921 iter time (s): 63.554 samples/sec: 16.112 %comms: 0.002882409945519946 %optimizer_step 0.056932313845124545 %forward: 22.94284543315952 %backward: 61.45631123180004 [2025-04-06 00:56:16,019] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26893.21 | forward: 145811.15 | backward_microstep: 390592.99 | backward: 390579.94 | backward_inner_microstep: 390561.82 | backward_inner: 390554.75 | backward_allreduce_microstep: 8.62 | backward_allreduce: 2.95 | reduce_tied_grads: 0.33 | comms: 18.32 | reduce_grads: 0.22 | step: 361.83 | _step_clipping: 0.12 | _step_step: 359.92 | _step_zero_grad: 0.54 | _step_check_overflow: 0.63 samples/sec: 16.112 | iteration 16620/ 143000 | elapsed time per iteration (ms): 63554.7 | learning rate: 5.835E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.397631E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 01:06:53,288] [INFO] [logging.py:60:log_dist] [Rank 0] step=16630, skipped=14, lr=[0.0005834585446380479, 0.0005834585446380479], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16630 loss: 2.3691 iter time (s): 63.726 samples/sec: 16.069 %comms: 0.0028491041281588684 %optimizer_step 0.057263015985345174 %forward: 22.855335455191657 %backward: 61.270312723854204 [2025-04-06 01:06:53,288] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28906.18 | forward: 145648.63 | backward_microstep: 390466.49 | backward: 390453.11 | backward_inner_microstep: 390435.01 | backward_inner: 390428.22 | backward_allreduce_microstep: 8.68 | backward_allreduce: 2.98 | reduce_tied_grads: 0.35 | comms: 18.16 | reduce_grads: 0.23 | step: 364.92 | _step_clipping: 0.14 | _step_step: 362.98 | _step_zero_grad: 0.57 | _step_check_overflow: 0.59 samples/sec: 16.069 | iteration 16630/ 143000 | elapsed time per iteration (ms): 63726.9 | learning rate: 5.835E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.384521E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 01:17:25,439] [INFO] [logging.py:60:log_dist] [Rank 0] step=16640, skipped=14, lr=[0.0005834369551016075, 0.0005834369551016075], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16640 loss: 2.3918 iter time (s): 63.215 samples/sec: 16.199 %comms: 0.00291342980985906 %optimizer_step 0.05806361100660313 %forward: 23.057616370282215 %backward: 61.8027353829546 [2025-04-06 01:17:25,440] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23403.07 | forward: 145757.72 | backward_microstep: 390697.17 | backward: 390683.31 | backward_inner_microstep: 390665.26 | backward_inner: 390658.27 | backward_allreduce_microstep: 8.53 | backward_allreduce: 2.93 | reduce_tied_grads: 0.34 | comms: 18.42 | reduce_grads: 0.22 | step: 367.05 | _step_clipping: 0.14 | _step_step: 365.05 | _step_zero_grad: 0.54 | _step_check_overflow: 0.67 samples/sec: 16.199 | iteration 16640/ 143000 | elapsed time per iteration (ms): 63215.2 | learning rate: 5.834E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.394390E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 01:27:59,949] [INFO] [logging.py:60:log_dist] [Rank 0] step=16650, skipped=14, lr=[0.0005834153518852287, 0.0005834153518852287], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16650 loss: 2.3807 iter time (s): 63.450 samples/sec: 16.139 %comms: 0.002837782907316983 %optimizer_step 0.05638110477089759 %forward: 22.958653874287116 %backward: 61.55521831606647 [2025-04-06 01:27:59,950] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25991.95 | forward: 145673.59 | backward_microstep: 390582.54 | backward: 390570.36 | backward_inner_microstep: 390549.30 | backward_inner: 390540.87 | backward_allreduce_microstep: 8.31 | backward_allreduce: 2.87 | reduce_tied_grads: 0.30 | comms: 18.01 | reduce_grads: 0.20 | step: 357.74 | _step_clipping: 0.12 | _step_step: 356.05 | _step_zero_grad: 0.48 | _step_check_overflow: 0.53 samples/sec: 16.138 | iteration 16650/ 143000 | elapsed time per iteration (ms): 63451.0 | learning rate: 5.834E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.398362E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 01:38:36,780] [INFO] [logging.py:60:log_dist] [Rank 0] step=16660, skipped=14, lr=[0.0005833937349899542, 0.0005833937349899542], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16660 loss: 2.3914 iter time (s): 63.683 samples/sec: 16.080 %comms: 0.0028379960525752672 %optimizer_step 0.05607266057003835 %forward: 22.86620322644884 %backward: 61.3162162276785 [2025-04-06 01:38:36,781] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28446.19 | forward: 145617.81 | backward_microstep: 390488.93 | backward: 390477.29 | backward_inner_microstep: 390458.47 | backward_inner: 390449.98 | backward_allreduce_microstep: 9.78 | backward_allreduce: 4.44 | reduce_tied_grads: 0.33 | comms: 18.07 | reduce_grads: 0.21 | step: 357.08 | _step_clipping: 0.15 | _step_step: 355.22 | _step_zero_grad: 0.52 | _step_check_overflow: 0.58 samples/sec: 16.080 | iteration 16660/ 143000 | elapsed time per iteration (ms): 63683.1 | learning rate: 5.834E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.399786E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 01:49:14,788] [INFO] [logging.py:60:log_dist] [Rank 0] step=16670, skipped=14, lr=[0.0005833721044168273, 0.0005833721044168273], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16670 loss: 2.3848 iter time (s): 63.800 samples/sec: 16.050 %comms: 0.0028554871930476363 %optimizer_step 0.05652532041683306 %forward: 22.813588056812033 %backward: 61.181799578747096 [2025-04-06 01:49:14,788] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29823.18 | forward: 145550.95 | backward_microstep: 390353.53 | backward: 390340.57 | backward_inner_microstep: 390323.22 | backward_inner: 390314.51 | backward_allreduce_microstep: 8.15 | backward_allreduce: 2.80 | reduce_tied_grads: 0.34 | comms: 18.22 | reduce_grads: 0.20 | step: 360.63 | _step_clipping: 0.12 | _step_step: 358.79 | _step_zero_grad: 0.55 | _step_check_overflow: 0.54 samples/sec: 16.050 | iteration 16670/ 143000 | elapsed time per iteration (ms): 63800.7 | learning rate: 5.834E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.390432E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 01:59:48,931] [INFO] [logging.py:60:log_dist] [Rank 0] step=16680, skipped=14, lr=[0.000583350460166892, 0.000583350460166892], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16680 loss: 2.3902 iter time (s): 63.414 samples/sec: 16.148 %comms: 0.002848559434373165 %optimizer_step 0.056270450210094504 %forward: 22.92709224600045 %backward: 61.524735860338055 [2025-04-06 01:59:48,932] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26347.50 | forward: 145389.29 | backward_microstep: 390160.73 | backward: 390151.42 | backward_inner_microstep: 390133.96 | backward_inner: 390127.35 | backward_allreduce_microstep: 8.37 | backward_allreduce: 2.91 | reduce_tied_grads: 0.33 | comms: 18.06 | reduce_grads: 0.20 | step: 356.83 | _step_clipping: 0.12 | _step_step: 354.85 | _step_zero_grad: 0.51 | _step_check_overflow: 0.73 samples/sec: 16.148 | iteration 16680/ 143000 | elapsed time per iteration (ms): 63414.4 | learning rate: 5.834E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.392147E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 02:10:20,731] [INFO] [logging.py:60:log_dist] [Rank 0] step=16690, skipped=14, lr=[0.0005833288022411927, 0.0005833288022411927], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16690 loss: 2.3889 iter time (s): 63.179 samples/sec: 16.208 %comms: 0.0029637327807911303 %optimizer_step 0.058227573432028834 %forward: 23.02704748982168 %backward: 61.80523604119852 [2025-04-06 02:10:20,732] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23504.16 | forward: 145483.46 | backward_microstep: 390497.30 | backward: 390481.65 | backward_inner_microstep: 390463.52 | backward_inner: 390456.00 | backward_allreduce_microstep: 8.52 | backward_allreduce: 3.00 | reduce_tied_grads: 0.38 | comms: 18.72 | reduce_grads: 0.24 | step: 367.88 | _step_clipping: 0.14 | _step_step: 365.83 | _step_zero_grad: 0.52 | _step_check_overflow: 0.71 samples/sec: 16.208 | iteration 16690/ 143000 | elapsed time per iteration (ms): 63180.0 | learning rate: 5.833E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.410868E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 02:20:52,814] [INFO] [logging.py:60:log_dist] [Rank 0] step=16700, skipped=14, lr=[0.0005833071306407751, 0.0005833071306407751], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16700 loss: 2.4000 iter time (s): 63.208 samples/sec: 16.201 %comms: 0.0028646023893635552 %optimizer_step 0.05726273940140079 %forward: 23.03232142674713 %backward: 61.765944524987646 [2025-04-06 02:20:52,814] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23732.96 | forward: 145581.75 | backward_microstep: 390425.46 | backward: 390407.63 | backward_inner_microstep: 390389.57 | backward_inner: 390382.76 | backward_allreduce_microstep: 8.56 | backward_allreduce: 2.93 | reduce_tied_grads: 0.32 | comms: 18.11 | reduce_grads: 0.20 | step: 361.94 | _step_clipping: 0.12 | _step_step: 360.15 | _step_zero_grad: 0.54 | _step_check_overflow: 0.51 samples/sec: 16.200 | iteration 16700/ 143000 | elapsed time per iteration (ms): 63208.2 | learning rate: 5.833E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.399983E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 02:31:31,798] [INFO] [logging.py:60:log_dist] [Rank 0] step=16710, skipped=14, lr=[0.0005832854453666848, 0.0005832854453666848], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16710 loss: 2.3872 iter time (s): 63.898 samples/sec: 16.026 %comms: 0.002866456358227878 %optimizer_step 0.05918590791712507 %forward: 22.800782019651443 %backward: 61.09410615737975 [2025-04-06 02:31:31,798] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30519.94 | forward: 145692.02 | backward_microstep: 390390.93 | backward: 390378.00 | backward_inner_microstep: 390359.80 | backward_inner: 390352.65 | backward_allreduce_microstep: 8.73 | backward_allreduce: 3.15 | reduce_tied_grads: 0.36 | comms: 18.32 | reduce_grads: 0.24 | step: 378.19 | _step_clipping: 0.15 | _step_step: 376.24 | _step_zero_grad: 0.58 | _step_check_overflow: 0.57 samples/sec: 16.025 | iteration 16710/ 143000 | elapsed time per iteration (ms): 63898.4 | learning rate: 5.833E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.391273E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 02:42:06,272] [INFO] [logging.py:60:log_dist] [Rank 0] step=16720, skipped=14, lr=[0.0005832637464199689, 0.0005832637464199689], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16720 loss: 2.3834 iter time (s): 63.447 samples/sec: 16.139 %comms: 0.0028686798448800867 %optimizer_step 0.056992296211382215 %forward: 22.954162009967373 %backward: 61.51209347922981 [2025-04-06 02:42:06,273] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26230.80 | forward: 145636.96 | backward_microstep: 390285.37 | backward: 390274.95 | backward_inner_microstep: 390257.41 | backward_inner: 390250.66 | backward_allreduce_microstep: 8.44 | backward_allreduce: 2.93 | reduce_tied_grads: 0.34 | comms: 18.20 | reduce_grads: 0.22 | step: 361.60 | _step_clipping: 0.15 | _step_step: 359.43 | _step_zero_grad: 0.52 | _step_check_overflow: 0.87 samples/sec: 16.139 | iteration 16720/ 143000 | elapsed time per iteration (ms): 63447.5 | learning rate: 5.833E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.387902E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 02:52:42,957] [INFO] [stage1.py:697:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 524288.0, reducing to 524288.0 [2025-04-06 02:52:42,958] [INFO] [logging.py:60:log_dist] [Rank 0] step=16730, skipped=15, lr=[0.0005832442056786948, 0.0005832442056786948], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16730 loss: 2.3939 iter time (s): 63.668 samples/sec: 16.083 %comms: 0.002558506593256849 %optimizer_step 0.050520422552123696 %forward: 22.873576837213953 %backward: 61.34162212527988 [2025-04-06 02:52:42,958] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28168.10 | forward: 145631.25 | backward_microstep: 390563.45 | backward: 390549.19 | backward_inner_microstep: 390531.33 | backward_inner: 390524.21 | backward_allreduce_microstep: 8.52 | backward_allreduce: 2.86 | reduce_tied_grads: 0.33 | comms: 16.29 | reduce_grads: 0.21 | step: 321.65 | _step_clipping: 0.14 | _step_step: 320.05 | _step_zero_grad: 0.48 | _step_check_overflow: 0.39 samples/sec: 16.083 | iteration 16730/ 143000 | elapsed time per iteration (ms): 63668.5 | learning rate: 5.832E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.401754E+00 | loss scale: 524288.0 | number of skipped iterations: 1 | number of nan iterations: 0 | time (ms) [2025-04-06 02:53:46,506] [INFO] [stage1.py:697:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 524288.0, reducing to 262144.0 [2025-04-06 03:03:19,813] [INFO] [logging.py:60:log_dist] [Rank 0] step=16740, skipped=16, lr=[0.0005832246538642061, 0.0005832246538642061], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16740 loss: 2.4060 iter time (s): 63.685 samples/sec: 16.079 %comms: 0.0025866880937514777 %optimizer_step 0.052794770461375845 %forward: 22.870910531770804 %backward: 61.31385717155551 [2025-04-06 03:03:19,813] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28437.67 | forward: 145653.13 | backward_microstep: 390491.41 | backward: 390476.59 | backward_inner_microstep: 390457.04 | backward_inner: 390449.71 | backward_allreduce_microstep: 8.34 | backward_allreduce: 2.85 | reduce_tied_grads: 0.34 | comms: 16.47 | reduce_grads: 0.21 | step: 336.22 | _step_clipping: 0.12 | _step_step: 334.20 | _step_zero_grad: 0.60 | _step_check_overflow: 0.67 samples/sec: 16.079 | iteration 16740/ 143000 | elapsed time per iteration (ms): 63685.5 | learning rate: 5.832E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.399355E+00 | loss scale: 262144.0 | number of skipped iterations: 1 | number of nan iterations: 0 | time (ms) [2025-04-06 03:13:59,039] [INFO] [logging.py:60:log_dist] [Rank 0] step=16750, skipped=16, lr=[0.0005832029166397112, 0.0005832029166397112], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16750 loss: 2.3919 iter time (s): 63.922 samples/sec: 16.020 %comms: 0.0028367977974624037 %optimizer_step 0.054930158121774694 %forward: 22.767539701243837 %backward: 61.079479265510074 [2025-04-06 03:13:59,040] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31029.29 | forward: 145534.85 | backward_microstep: 390444.60 | backward: 390432.74 | backward_inner_microstep: 390415.58 | backward_inner: 390408.86 | backward_allreduce_microstep: 8.14 | backward_allreduce: 2.80 | reduce_tied_grads: 0.29 | comms: 18.13 | reduce_grads: 0.20 | step: 351.13 | _step_clipping: 0.13 | _step_step: 349.35 | _step_zero_grad: 0.50 | _step_check_overflow: 0.56 samples/sec: 16.019 | iteration 16750/ 143000 | elapsed time per iteration (ms): 63922.7 | learning rate: 5.832E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.395146E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 03:24:47,028] [INFO] [logging.py:60:log_dist] [Rank 0] step=16760, skipped=16, lr=[0.0005831811657465736, 0.0005831811657465736], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16760 loss: 2.3933 iter time (s): 64.798 samples/sec: 15.803 %comms: 0.002829639309705603 %optimizer_step 0.05762040463609591 %forward: 22.498316373809217 %backward: 60.27770564761873 [2025-04-06 03:24:47,029] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 39349.83 | forward: 145785.26 | backward_microstep: 390603.86 | backward: 390589.27 | backward_inner_microstep: 390571.26 | backward_inner: 390564.44 | backward_allreduce_microstep: 8.59 | backward_allreduce: 2.93 | reduce_tied_grads: 0.33 | comms: 18.34 | reduce_grads: 0.23 | step: 373.37 | _step_clipping: 0.17 | _step_step: 371.26 | _step_zero_grad: 0.56 | _step_check_overflow: 0.70 samples/sec: 15.803 | iteration 16760/ 143000 | elapsed time per iteration (ms): 64798.9 | learning rate: 5.832E-04 | approx flops per GPU: 68.2TFLOPS | lm_loss: 2.391432E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 03:35:23,002] [INFO] [logging.py:60:log_dist] [Rank 0] step=16770, skipped=16, lr=[0.0005831594011858433, 0.0005831594011858433], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16770 loss: 2.3842 iter time (s): 63.597 samples/sec: 16.101 %comms: 0.0029709435935715094 %optimizer_step 0.059620226328843284 %forward: 22.89017266166092 %backward: 61.400457320622415 [2025-04-06 03:35:23,003] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27686.19 | forward: 145573.75 | backward_microstep: 390499.23 | backward: 390486.12 | backward_inner_microstep: 390468.87 | backward_inner: 390462.04 | backward_allreduce_microstep: 8.12 | backward_allreduce: 2.79 | reduce_tied_grads: 0.36 | comms: 18.89 | reduce_grads: 0.23 | step: 379.16 | _step_clipping: 0.16 | _step_step: 376.93 | _step_zero_grad: 0.66 | _step_check_overflow: 0.69 samples/sec: 16.101 | iteration 16770/ 143000 | elapsed time per iteration (ms): 63597.4 | learning rate: 5.832E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.392514E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 03:45:58,174] [INFO] [logging.py:60:log_dist] [Rank 0] step=16780, skipped=16, lr=[0.0005831376229585704, 0.0005831376229585704], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16780 loss: 2.3797 iter time (s): 63.517 samples/sec: 16.122 %comms: 0.002844511260238057 %optimizer_step 0.055935898808874465 %forward: 22.900234995592918 %backward: 61.437164517664435 [2025-04-06 03:45:58,174] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27355.13 | forward: 145454.44 | backward_microstep: 390238.45 | backward: 390227.80 | backward_inner_microstep: 390210.29 | backward_inner: 390203.54 | backward_allreduce_microstep: 8.39 | backward_allreduce: 2.88 | reduce_tied_grads: 0.34 | comms: 18.07 | reduce_grads: 0.22 | step: 355.29 | _step_clipping: 0.13 | _step_step: 353.52 | _step_zero_grad: 0.52 | _step_check_overflow: 0.51 samples/sec: 16.122 | iteration 16780/ 143000 | elapsed time per iteration (ms): 63517.2 | learning rate: 5.831E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.387081E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 03:56:30,184] [INFO] [logging.py:60:log_dist] [Rank 0] step=16790, skipped=16, lr=[0.0005831158310658062, 0.0005831158310658062], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16790 loss: 2.3948 iter time (s): 63.200 samples/sec: 16.202 %comms: 0.0029048019500175683 %optimizer_step 0.05748490678661018 %forward: 23.026853322248673 %backward: 61.78824116086263 [2025-04-06 03:56:30,185] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23790.86 | forward: 145530.67 | backward_microstep: 390518.07 | backward: 390504.25 | backward_inner_microstep: 390486.58 | backward_inner: 390477.79 | backward_allreduce_microstep: 8.35 | backward_allreduce: 2.86 | reduce_tied_grads: 0.34 | comms: 18.36 | reduce_grads: 0.22 | step: 363.31 | _step_clipping: 0.14 | _step_step: 361.43 | _step_zero_grad: 0.50 | _step_check_overflow: 0.60 samples/sec: 16.202 | iteration 16790/ 143000 | elapsed time per iteration (ms): 63201.0 | learning rate: 5.831E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.386704E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 04:07:05,382] [INFO] [logging.py:60:log_dist] [Rank 0] step=16800, skipped=16, lr=[0.0005830940255086025, 0.0005830940255086025], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16800 loss: 2.3910 iter time (s): 63.519 samples/sec: 16.121 %comms: 0.0028485979651572003 %optimizer_step 0.058363077441876104 %forward: 22.964322809254615 %backward: 61.51524210538402 [2025-04-06 04:07:05,382] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26371.93 | forward: 145867.51 | backward_microstep: 390756.61 | backward: 390739.82 | backward_inner_microstep: 390721.31 | backward_inner: 390714.05 | backward_allreduce_microstep: 8.76 | backward_allreduce: 2.97 | reduce_tied_grads: 0.31 | comms: 18.09 | reduce_grads: 0.20 | step: 370.72 | _step_clipping: 0.15 | _step_step: 368.90 | _step_zero_grad: 0.51 | _step_check_overflow: 0.56 samples/sec: 16.121 | iteration 16800/ 143000 | elapsed time per iteration (ms): 63519.8 | learning rate: 5.831E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.384898E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 04:17:46,293] [INFO] [logging.py:60:log_dist] [Rank 0] step=16810, skipped=16, lr=[0.0005830722062880118, 0.0005830722062880118], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16810 loss: 2.4099 iter time (s): 64.091 samples/sec: 15.977 %comms: 0.0028668391895882305 %optimizer_step 0.05661335004466901 %forward: 22.74282947643871 %backward: 60.93845608310418 [2025-04-06 04:17:46,294] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32383.96 | forward: 145760.03 | backward_microstep: 390571.42 | backward: 390557.87 | backward_inner_microstep: 390540.06 | backward_inner: 390532.91 | backward_allreduce_microstep: 8.43 | backward_allreduce: 2.90 | reduce_tied_grads: 0.36 | comms: 18.37 | reduce_grads: 0.24 | step: 362.84 | _step_clipping: 0.15 | _step_step: 360.79 | _step_zero_grad: 0.59 | _step_check_overflow: 0.58 samples/sec: 15.977 | iteration 16810/ 143000 | elapsed time per iteration (ms): 64091.2 | learning rate: 5.831E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.398094E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 04:28:22,834] [INFO] [logging.py:60:log_dist] [Rank 0] step=16820, skipped=16, lr=[0.0005830503734050869, 0.0005830503734050869], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16820 loss: 2.3936 iter time (s): 63.653 samples/sec: 16.087 %comms: 0.0028865242251542786 %optimizer_step 0.056451372035830415 %forward: 22.909704904322794 %backward: 61.374992348126355 [2025-04-06 04:28:22,835] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27886.55 | forward: 145828.22 | backward_microstep: 390689.65 | backward: 390673.11 | backward_inner_microstep: 390654.90 | backward_inner: 390647.77 | backward_allreduce_microstep: 8.67 | backward_allreduce: 2.94 | reduce_tied_grads: 0.36 | comms: 18.37 | reduce_grads: 0.23 | step: 359.33 | _step_clipping: 0.16 | _step_step: 357.32 | _step_zero_grad: 0.59 | _step_check_overflow: 0.59 samples/sec: 16.087 | iteration 16820/ 143000 | elapsed time per iteration (ms): 63654.1 | learning rate: 5.831E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.391116E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 04:38:58,322] [INFO] [logging.py:60:log_dist] [Rank 0] step=16830, skipped=16, lr=[0.0005830285268608818, 0.0005830285268608818], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16830 loss: 2.3998 iter time (s): 63.548 samples/sec: 16.114 %comms: 0.0030192403633040694 %optimizer_step 0.057905822401150456 %forward: 22.953359972058855 %backward: 61.458482642616765 [2025-04-06 04:38:58,323] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26920.64 | forward: 145864.52 | backward_microstep: 390569.50 | backward: 390557.72 | backward_inner_microstep: 390540.27 | backward_inner: 390533.49 | backward_allreduce_microstep: 8.26 | backward_allreduce: 2.83 | reduce_tied_grads: 0.37 | comms: 19.19 | reduce_grads: 0.23 | step: 367.98 | _step_clipping: 0.14 | _step_step: 366.14 | _step_zero_grad: 0.54 | _step_check_overflow: 0.52 samples/sec: 16.114 | iteration 16830/ 143000 | elapsed time per iteration (ms): 63548.8 | learning rate: 5.830E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.392048E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 04:49:35,278] [INFO] [logging.py:60:log_dist] [Rank 0] step=16840, skipped=16, lr=[0.000583006666656451, 0.000583006666656451], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16840 loss: 2.4028 iter time (s): 63.695 samples/sec: 16.077 %comms: 0.0028809759620475757 %optimizer_step 0.057447784371281736 %forward: 22.891776797712595 %backward: 61.32330080733419 [2025-04-06 04:49:35,278] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28407.80 | forward: 145809.06 | backward_microstep: 390610.03 | backward: 390598.47 | backward_inner_microstep: 390580.71 | backward_inner: 390573.90 | backward_allreduce_microstep: 8.54 | backward_allreduce: 2.93 | reduce_tied_grads: 0.35 | comms: 18.35 | reduce_grads: 0.22 | step: 365.91 | _step_clipping: 0.14 | _step_step: 363.97 | _step_zero_grad: 0.56 | _step_check_overflow: 0.59 samples/sec: 16.076 | iteration 16840/ 143000 | elapsed time per iteration (ms): 63695.5 | learning rate: 5.830E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.408495E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 05:00:15,417] [INFO] [logging.py:60:log_dist] [Rank 0] step=16850, skipped=16, lr=[0.0005829847927928492, 0.0005829847927928492], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16850 loss: 2.3935 iter time (s): 64.013 samples/sec: 15.997 %comms: 0.0028376676088658725 %optimizer_step 0.057128298689523314 %forward: 22.775026706288816 %backward: 61.02966724369695 [2025-04-06 05:00:15,418] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31475.28 | forward: 145790.67 | backward_microstep: 390685.02 | backward: 390671.61 | backward_inner_microstep: 390653.74 | backward_inner: 390646.90 | backward_allreduce_microstep: 8.67 | backward_allreduce: 2.85 | reduce_tied_grads: 0.33 | comms: 18.16 | reduce_grads: 0.22 | step: 365.70 | _step_clipping: 0.13 | _step_step: 363.80 | _step_zero_grad: 0.53 | _step_check_overflow: 0.62 samples/sec: 15.997 | iteration 16850/ 143000 | elapsed time per iteration (ms): 64014.0 | learning rate: 5.830E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.395967E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 05:10:49,327] [INFO] [logging.py:60:log_dist] [Rank 0] step=16860, skipped=16, lr=[0.0005829629052711326, 0.0005829629052711326], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16860 loss: 2.4059 iter time (s): 63.390 samples/sec: 16.154 %comms: 0.002868902454701814 %optimizer_step 0.05694266254987376 %forward: 22.974594133009553 %backward: 61.61373334149626 [2025-04-06 05:10:49,328] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25527.75 | forward: 145636.91 | backward_microstep: 390584.02 | backward: 390572.03 | backward_inner_microstep: 390554.99 | backward_inner: 390548.26 | backward_allreduce_microstep: 8.02 | backward_allreduce: 2.77 | reduce_tied_grads: 0.35 | comms: 18.19 | reduce_grads: 0.22 | step: 360.96 | _step_clipping: 0.15 | _step_step: 359.05 | _step_zero_grad: 0.54 | _step_check_overflow: 0.60 samples/sec: 16.154 | iteration 16860/ 143000 | elapsed time per iteration (ms): 63391.0 | learning rate: 5.830E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.397306E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 05:21:28,071] [INFO] [logging.py:60:log_dist] [Rank 0] step=16870, skipped=16, lr=[0.0005829410040923571, 0.0005829410040923571], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16870 loss: 2.3923 iter time (s): 63.874 samples/sec: 16.032 %comms: 0.00285812662130033 %optimizer_step 0.0558533804626149 %forward: 22.793970321600764 %backward: 61.14222308012762 [2025-04-06 05:21:28,072] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30460.06 | forward: 145593.82 | backward_microstep: 390550.86 | backward: 390538.79 | backward_inner_microstep: 390521.71 | backward_inner: 390515.08 | backward_allreduce_microstep: 8.08 | backward_allreduce: 2.76 | reduce_tied_grads: 0.33 | comms: 18.26 | reduce_grads: 0.21 | step: 356.76 | _step_clipping: 0.13 | _step_step: 354.92 | _step_zero_grad: 0.48 | _step_check_overflow: 0.65 samples/sec: 16.031 | iteration 16870/ 143000 | elapsed time per iteration (ms): 63874.4 | learning rate: 5.829E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.393724E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 05:31:55,768] [INFO] [logging.py:60:log_dist] [Rank 0] step=16880, skipped=16, lr=[0.0005829190892575803, 0.0005829190892575803], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16880 loss: 2.4136 iter time (s): 62.769 samples/sec: 16.314 %comms: 0.0029144332726452143 %optimizer_step 0.05884907349294922 %forward: 23.19178400517671 %backward: 62.21073258875358 [2025-04-06 05:31:55,769] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19479.91 | forward: 145572.61 | backward_microstep: 390504.45 | backward: 390490.82 | backward_inner_microstep: 390473.29 | backward_inner: 390466.43 | backward_allreduce_microstep: 8.31 | backward_allreduce: 3.00 | reduce_tied_grads: 0.35 | comms: 18.29 | reduce_grads: 0.21 | step: 369.39 | _step_clipping: 0.14 | _step_step: 367.46 | _step_zero_grad: 0.52 | _step_check_overflow: 0.61 samples/sec: 16.314 | iteration 16880/ 143000 | elapsed time per iteration (ms): 62769.7 | learning rate: 5.829E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.410350E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 05:42:35,599] [INFO] [logging.py:60:log_dist] [Rank 0] step=16890, skipped=16, lr=[0.0005828971607678593, 0.0005828971607678593], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16890 loss: 2.4139 iter time (s): 63.983 samples/sec: 16.004 %comms: 0.002815596065961236 %optimizer_step 0.055120137187704886 %forward: 22.75775474237132 %backward: 61.00346065598108 [2025-04-06 05:42:35,600] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31765.57 | forward: 145609.97 | backward_microstep: 390327.53 | backward: 390315.83 | backward_inner_microstep: 390298.42 | backward_inner: 390291.68 | backward_allreduce_microstep: 8.31 | backward_allreduce: 2.88 | reduce_tied_grads: 0.29 | comms: 18.01 | reduce_grads: 0.20 | step: 352.67 | _step_clipping: 0.13 | _step_step: 350.98 | _step_zero_grad: 0.52 | _step_check_overflow: 0.46 samples/sec: 16.004 | iteration 16890/ 143000 | elapsed time per iteration (ms): 63983.1 | learning rate: 5.829E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.395862E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 05:53:08,999] [INFO] [logging.py:60:log_dist] [Rank 0] step=16900, skipped=16, lr=[0.0005828752186242529, 0.0005828752186242529], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16900 loss: 2.3761 iter time (s): 63.339 samples/sec: 16.167 %comms: 0.0029043416728432797 %optimizer_step 0.057781561016331126 %forward: 22.980528617459477 %backward: 61.66622908378022 [2025-04-06 05:53:08,999] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25020.56 | forward: 145557.09 | backward_microstep: 390604.56 | backward: 390589.66 | backward_inner_microstep: 390571.46 | backward_inner: 390564.58 | backward_allreduce_microstep: 8.60 | backward_allreduce: 2.87 | reduce_tied_grads: 0.35 | comms: 18.40 | reduce_grads: 0.22 | step: 365.98 | _step_clipping: 0.14 | _step_step: 364.06 | _step_zero_grad: 0.52 | _step_check_overflow: 0.63 samples/sec: 16.167 | iteration 16900/ 143000 | elapsed time per iteration (ms): 63339.9 | learning rate: 5.829E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.384414E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 06:03:46,725] [INFO] [logging.py:60:log_dist] [Rank 0] step=16910, skipped=16, lr=[0.00058285326282782, 0.00058285326282782], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16910 loss: 2.3807 iter time (s): 63.772 samples/sec: 16.057 %comms: 0.002845339172815088 %optimizer_step 0.05523181419433556 %forward: 22.815169732046723 %backward: 61.21262363311253 [2025-04-06 06:03:46,726] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29706.83 | forward: 145497.11 | backward_microstep: 390379.40 | backward: 390365.70 | backward_inner_microstep: 390348.15 | backward_inner: 390341.34 | backward_allreduce_microstep: 8.42 | backward_allreduce: 2.89 | reduce_tied_grads: 0.34 | comms: 18.15 | reduce_grads: 0.19 | step: 352.22 | _step_clipping: 0.13 | _step_step: 350.52 | _step_zero_grad: 0.50 | _step_check_overflow: 0.48 samples/sec: 16.057 | iteration 16910/ 143000 | elapsed time per iteration (ms): 63772.7 | learning rate: 5.829E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.394371E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 06:14:17,159] [INFO] [logging.py:60:log_dist] [Rank 0] step=16920, skipped=16, lr=[0.0005828312933796202, 0.0005828312933796202], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16920 loss: 2.3941 iter time (s): 63.043 samples/sec: 16.243 %comms: 0.0028710710008188903 %optimizer_step 0.058822975547949755 %forward: 23.10204582455245 %backward: 61.954200896406554 [2025-04-06 06:14:17,159] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21973.58 | forward: 145641.67 | backward_microstep: 390590.81 | backward: 390576.37 | backward_inner_microstep: 390559.05 | backward_inner: 390552.47 | backward_allreduce_microstep: 8.25 | backward_allreduce: 2.84 | reduce_tied_grads: 0.35 | comms: 18.10 | reduce_grads: 0.21 | step: 370.84 | _step_clipping: 0.13 | _step_step: 368.97 | _step_zero_grad: 0.55 | _step_check_overflow: 0.58 samples/sec: 16.243 | iteration 16920/ 143000 | elapsed time per iteration (ms): 63043.4 | learning rate: 5.828E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.385979E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 06:24:52,774] [INFO] [logging.py:60:log_dist] [Rank 0] step=16930, skipped=16, lr=[0.000582809310280714, 0.000582809310280714], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16930 loss: 2.4009 iter time (s): 63.561 samples/sec: 16.111 %comms: 0.0028602305519122135 %optimizer_step 0.056131705744267124 %forward: 22.897196417332196 %backward: 61.41540309352727 [2025-04-06 06:24:52,774] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27550.45 | forward: 145536.73 | backward_microstep: 390373.67 | backward: 390362.06 | backward_inner_microstep: 390344.64 | backward_inner: 390337.93 | backward_allreduce_microstep: 8.31 | backward_allreduce: 2.84 | reduce_tied_grads: 0.33 | comms: 18.18 | reduce_grads: 0.20 | step: 356.78 | _step_clipping: 0.13 | _step_step: 354.98 | _step_zero_grad: 0.52 | _step_check_overflow: 0.55 samples/sec: 16.110 | iteration 16930/ 143000 | elapsed time per iteration (ms): 63561.5 | learning rate: 5.828E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.396253E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 06:35:25,086] [INFO] [logging.py:60:log_dist] [Rank 0] step=16940, skipped=16, lr=[0.0005827873135321623, 0.0005827873135321623], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16940 loss: 2.3852 iter time (s): 63.231 samples/sec: 16.195 %comms: 0.0028974596856377616 %optimizer_step 0.057323421151261746 %forward: 23.047026107414517 %backward: 61.763067112011505 [2025-04-06 06:35:25,086] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23861.97 | forward: 145727.63 | backward_microstep: 390547.52 | backward: 390531.31 | backward_inner_microstep: 390513.67 | backward_inner: 390506.59 | backward_allreduce_microstep: 8.17 | backward_allreduce: 2.82 | reduce_tied_grads: 0.33 | comms: 18.32 | reduce_grads: 0.20 | step: 362.46 | _step_clipping: 0.13 | _step_step: 360.59 | _step_zero_grad: 0.56 | _step_check_overflow: 0.54 samples/sec: 16.195 | iteration 16940/ 143000 | elapsed time per iteration (ms): 63231.2 | learning rate: 5.828E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.386643E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 06:45:55,371] [INFO] [logging.py:60:log_dist] [Rank 0] step=16950, skipped=16, lr=[0.0005827653031350269, 0.0005827653031350269], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16950 loss: 2.3821 iter time (s): 63.028 samples/sec: 16.247 %comms: 0.002933442051504386 %optimizer_step 0.05777838317311638 %forward: 23.103635488671937 %backward: 61.97003419493215 [2025-04-06 06:45:55,372] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21916.34 | forward: 145617.48 | backward_microstep: 390597.76 | backward: 390584.42 | backward_inner_microstep: 390564.40 | backward_inner: 390555.58 | backward_allreduce_microstep: 8.46 | backward_allreduce: 2.90 | reduce_tied_grads: 0.35 | comms: 18.49 | reduce_grads: 0.21 | step: 364.17 | _step_clipping: 0.13 | _step_step: 362.18 | _step_zero_grad: 0.57 | _step_check_overflow: 0.65 samples/sec: 16.247 | iteration 16950/ 143000 | elapsed time per iteration (ms): 63028.6 | learning rate: 5.828E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.392863E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 06:56:28,795] [INFO] [logging.py:60:log_dist] [Rank 0] step=16960, skipped=16, lr=[0.0005827432790903699, 0.0005827432790903699], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16960 loss: 2.4117 iter time (s): 63.342 samples/sec: 16.166 %comms: 0.0029098773655995367 %optimizer_step 0.05718499219464204 %forward: 22.97150174631938 %backward: 61.62036723606905 [2025-04-06 06:56:28,796] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25456.15 | forward: 145505.44 | backward_microstep: 390327.34 | backward: 390314.00 | backward_inner_microstep: 390296.38 | backward_inner: 390289.69 | backward_allreduce_microstep: 8.20 | backward_allreduce: 2.82 | reduce_tied_grads: 0.36 | comms: 18.43 | reduce_grads: 0.21 | step: 362.22 | _step_clipping: 0.15 | _step_step: 360.25 | _step_zero_grad: 0.57 | _step_check_overflow: 0.62 samples/sec: 16.166 | iteration 16960/ 143000 | elapsed time per iteration (ms): 63342.4 | learning rate: 5.827E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.386867E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 07:07:06,236] [INFO] [logging.py:60:log_dist] [Rank 0] step=16970, skipped=16, lr=[0.0005827212413992544, 0.0005827212413992544], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16970 loss: 2.3919 iter time (s): 63.744 samples/sec: 16.064 %comms: 0.002856299867120379 %optimizer_step 0.05660692171246891 %forward: 22.79781199007075 %backward: 61.20567797853425 [2025-04-06 07:07:06,237] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29880.17 | forward: 145321.38 | backward_microstep: 390159.25 | backward: 390146.80 | backward_inner_microstep: 390129.06 | backward_inner: 390122.42 | backward_allreduce_microstep: 8.57 | backward_allreduce: 2.92 | reduce_tied_grads: 0.32 | comms: 18.21 | reduce_grads: 0.21 | step: 360.83 | _step_clipping: 0.13 | _step_step: 358.86 | _step_zero_grad: 0.56 | _step_check_overflow: 0.64 samples/sec: 16.064 | iteration 16970/ 143000 | elapsed time per iteration (ms): 63744.2 | learning rate: 5.827E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.394660E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 07:17:39,688] [INFO] [logging.py:60:log_dist] [Rank 0] step=16980, skipped=16, lr=[0.0005826991900627441, 0.0005826991900627441], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16980 loss: 2.3817 iter time (s): 63.345 samples/sec: 16.166 %comms: 0.0029355672990693275 %optimizer_step 0.05746355862468287 %forward: 22.986467914736462 %backward: 61.620205165930585 [2025-04-06 07:17:39,689] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25356.77 | forward: 145606.74 | backward_microstep: 390342.92 | backward: 390330.41 | backward_inner_microstep: 390312.06 | backward_inner: 390305.01 | backward_allreduce_microstep: 8.82 | backward_allreduce: 3.00 | reduce_tied_grads: 0.32 | comms: 18.60 | reduce_grads: 0.21 | step: 364.00 | _step_clipping: 0.16 | _step_step: 361.86 | _step_zero_grad: 0.60 | _step_check_overflow: 0.71 samples/sec: 16.165 | iteration 16980/ 143000 | elapsed time per iteration (ms): 63345.2 | learning rate: 5.827E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.389294E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 07:28:15,212] [INFO] [logging.py:60:log_dist] [Rank 0] step=16990, skipped=16, lr=[0.0005826771250819033, 0.0005826771250819033], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 16990 loss: 2.3881 iter time (s): 63.552 samples/sec: 16.113 %comms: 0.002845486716993353 %optimizer_step 0.05564050500475843 %forward: 22.91484952991922 %backward: 61.42498055902648 [2025-04-06 07:28:15,213] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27387.20 | forward: 145627.93 | backward_microstep: 390379.77 | backward: 390366.64 | backward_inner_microstep: 390349.18 | backward_inner: 390342.27 | backward_allreduce_microstep: 8.31 | backward_allreduce: 2.89 | reduce_tied_grads: 0.30 | comms: 18.08 | reduce_grads: 0.21 | step: 353.61 | _step_clipping: 0.12 | _step_step: 351.83 | _step_zero_grad: 0.48 | _step_check_overflow: 0.59 samples/sec: 16.113 | iteration 16990/ 143000 | elapsed time per iteration (ms): 63552.4 | learning rate: 5.827E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.388956E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 07:38:44,144] [INFO] [logging.py:60:log_dist] [Rank 0] step=17000, skipped=16, lr=[0.0005826550464577967, 0.0005826550464577967], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17000 loss: 2.3783 iter time (s): 62.893 samples/sec: 16.282 %comms: 0.002875611183377446 %optimizer_step 0.057276392920188254 %forward: 23.107453324284467 %backward: 62.02473019249023 [2025-04-06 07:38:44,145] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21404.71 | forward: 145328.89 | backward_microstep: 390098.80 | backward: 390089.94 | backward_inner_microstep: 390072.75 | backward_inner: 390066.08 | backward_allreduce_microstep: 8.34 | backward_allreduce: 2.86 | reduce_tied_grads: 0.34 | comms: 18.09 | reduce_grads: 0.20 | step: 360.23 | _step_clipping: 0.14 | _step_step: 358.24 | _step_zero_grad: 0.51 | _step_check_overflow: 0.73 samples/sec: 16.282 | iteration 17000/ 143000 | elapsed time per iteration (ms): 62893.2 | learning rate: 5.827E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.388667E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 07:38:46,997] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: ../checkpoints/mydata_left-pythia160m/global_step17000/mp_rank_00_model_states.pt [2025-04-06 07:39:01,507] [INFO] [engine.py:1805:_copy_recovery_script] creating recovery script ../checkpoints/mydata_left-pythia160m/zero_to_fp32.py [2025-04-06 07:39:01,512] [INFO] [engine.py:1818:_save_zero_checkpoint] zero checkpoint saved ../checkpoints/mydata_left-pythia160m/global_step17000/zero_pp_rank_0_mp_rank_00_optim_states.pt [2025-04-06 07:49:37,823] [INFO] [logging.py:60:log_dist] [Rank 0] step=17010, skipped=16, lr=[0.0005826329541914901, 0.0005826329541914901], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17010 loss: 2.4112 iter time (s): 63.630 samples/sec: 16.093 %comms: 0.002820424371972695 %optimizer_step 0.0576379829346917 %forward: 22.895610556336287 %backward: 61.364401228048116 [2025-04-06 07:49:37,823] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27976.36 | forward: 145683.82 | backward_microstep: 390475.19 | backward: 390459.14 | backward_inner_microstep: 390442.04 | backward_inner: 390435.15 | backward_allreduce_microstep: 8.04 | backward_allreduce: 2.76 | reduce_tied_grads: 0.30 | comms: 17.95 | reduce_grads: 0.20 | step: 366.75 | _step_clipping: 0.13 | _step_step: 364.95 | _step_zero_grad: 0.51 | _step_check_overflow: 0.57 samples/sec: 15.665 | iteration 17010/ 143000 | elapsed time per iteration (ms): 65367.8 | learning rate: 5.826E-04 | approx flops per GPU: 67.6TFLOPS | lm_loss: 2.397490E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 08:00:14,634] [INFO] [logging.py:60:log_dist] [Rank 0] step=17020, skipped=16, lr=[0.0005826108482840498, 0.0005826108482840498], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17020 loss: 2.4002 iter time (s): 63.681 samples/sec: 16.080 %comms: 0.0028455723537204177 %optimizer_step 0.05886550360150398 %forward: 22.862180606176242 %backward: 61.317409942147435 [2025-04-06 08:00:14,635] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28561.77 | forward: 145587.66 | backward_microstep: 390485.27 | backward: 390472.73 | backward_inner_microstep: 390454.21 | backward_inner: 390447.31 | backward_allreduce_microstep: 8.73 | backward_allreduce: 2.99 | reduce_tied_grads: 0.37 | comms: 18.12 | reduce_grads: 0.22 | step: 374.86 | _step_clipping: 0.15 | _step_step: 372.77 | _step_zero_grad: 0.59 | _step_check_overflow: 0.71 samples/sec: 16.080 | iteration 17020/ 143000 | elapsed time per iteration (ms): 63681.2 | learning rate: 5.826E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.389910E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 08:10:51,247] [INFO] [logging.py:60:log_dist] [Rank 0] step=17030, skipped=16, lr=[0.0005825887287365427, 0.0005825887287365427], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17030 loss: 2.3814 iter time (s): 63.661 samples/sec: 16.085 %comms: 0.0028547038596883393 %optimizer_step 0.05618243254965819 %forward: 22.88438627824705 %backward: 61.31845769612968 [2025-04-06 08:10:51,247] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28358.88 | forward: 145683.39 | backward_microstep: 390369.95 | backward: 390357.02 | backward_inner_microstep: 390339.34 | backward_inner: 390332.54 | backward_allreduce_microstep: 8.34 | backward_allreduce: 2.85 | reduce_tied_grads: 0.31 | comms: 18.17 | reduce_grads: 0.20 | step: 357.66 | _step_clipping: 0.14 | _step_step: 355.86 | _step_zero_grad: 0.52 | _step_check_overflow: 0.54 samples/sec: 16.085 | iteration 17030/ 143000 | elapsed time per iteration (ms): 63661.2 | learning rate: 5.826E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.392726E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 08:21:25,816] [INFO] [logging.py:60:log_dist] [Rank 0] step=17040, skipped=16, lr=[0.0005825665955500364, 0.0005825665955500364], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17040 loss: 2.3815 iter time (s): 63.456 samples/sec: 16.137 %comms: 0.0028573571812063273 %optimizer_step 0.055883631925943936 %forward: 22.942264175421116 %backward: 61.5056543352679 [2025-04-06 08:21:25,816] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26563.80 | forward: 145583.13 | backward_microstep: 390305.75 | backward: 390292.16 | backward_inner_microstep: 390274.98 | backward_inner: 390268.31 | backward_allreduce_microstep: 8.19 | backward_allreduce: 2.81 | reduce_tied_grads: 0.31 | comms: 18.13 | reduce_grads: 0.19 | step: 354.62 | _step_clipping: 0.12 | _step_step: 352.77 | _step_zero_grad: 0.54 | _step_check_overflow: 0.59 samples/sec: 16.137 | iteration 17040/ 143000 | elapsed time per iteration (ms): 63456.9 | learning rate: 5.826E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.385689E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 08:32:04,677] [INFO] [logging.py:60:log_dist] [Rank 0] step=17050, skipped=16, lr=[0.0005825444487255992, 0.0005825444487255992], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17050 loss: 2.3857 iter time (s): 63.886 samples/sec: 16.029 %comms: 0.0028226354981457803 %optimizer_step 0.05798632425226699 %forward: 22.7996695513894 %backward: 61.07889467444948 [2025-04-06 08:32:04,677] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30861.47 | forward: 145656.85 | backward_microstep: 390216.49 | backward: 390205.63 | backward_inner_microstep: 390186.35 | backward_inner: 390179.46 | backward_allreduce_microstep: 8.37 | backward_allreduce: 2.85 | reduce_tied_grads: 0.34 | comms: 18.03 | reduce_grads: 0.20 | step: 370.45 | _step_clipping: 0.14 | _step_step: 368.68 | _step_zero_grad: 0.53 | _step_check_overflow: 0.49 samples/sec: 16.029 | iteration 17050/ 143000 | elapsed time per iteration (ms): 63886.1 | learning rate: 5.825E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.392358E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 08:42:35,185] [INFO] [logging.py:60:log_dist] [Rank 0] step=17060, skipped=16, lr=[0.0005825222882642998, 0.0005825222882642998], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17060 loss: 2.3812 iter time (s): 63.050 samples/sec: 16.241 %comms: 0.0029149334181935945 %optimizer_step 0.05795081697044166 %forward: 23.09949079815935 %backward: 61.89588886447845 [2025-04-06 08:42:35,186] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22432.12 | forward: 145642.92 | backward_microstep: 390270.33 | backward: 390255.26 | backward_inner_microstep: 390234.50 | backward_inner: 390227.75 | backward_allreduce_microstep: 8.32 | backward_allreduce: 2.86 | reduce_tied_grads: 0.33 | comms: 18.38 | reduce_grads: 0.22 | step: 365.38 | _step_clipping: 0.13 | _step_step: 363.56 | _step_zero_grad: 0.53 | _step_check_overflow: 0.53 samples/sec: 16.241 | iteration 17060/ 143000 | elapsed time per iteration (ms): 63050.9 | learning rate: 5.825E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.380134E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 08:53:14,689] [INFO] [logging.py:60:log_dist] [Rank 0] step=17070, skipped=16, lr=[0.0005825001141672079, 0.0005825001141672079], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17070 loss: 2.3972 iter time (s): 63.950 samples/sec: 16.013 %comms: 0.002840566813989239 %optimizer_step 0.05608746541235404 %forward: 22.781059856566994 %backward: 61.03332254646686 [2025-04-06 08:53:14,690] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31362.81 | forward: 145684.28 | backward_microstep: 390319.16 | backward: 390306.49 | backward_inner_microstep: 390288.14 | backward_inner: 390281.07 | backward_allreduce_microstep: 8.82 | backward_allreduce: 3.05 | reduce_tied_grads: 0.35 | comms: 18.17 | reduce_grads: 0.23 | step: 358.68 | _step_clipping: 0.14 | _step_step: 356.82 | _step_zero_grad: 0.52 | _step_check_overflow: 0.58 samples/sec: 16.012 | iteration 17070/ 143000 | elapsed time per iteration (ms): 63950.3 | learning rate: 5.825E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.388005E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 09:03:43,201] [INFO] [logging.py:60:log_dist] [Rank 0] step=17080, skipped=16, lr=[0.0005824779264353939, 0.0005824779264353939], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17080 loss: 2.3780 iter time (s): 62.851 samples/sec: 16.293 %comms: 0.002896766609407574 %optimizer_step 0.05687446534165917 %forward: 23.161870732430195 %backward: 62.109557433102815 [2025-04-06 09:03:43,202] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20446.72 | forward: 145573.80 | backward_microstep: 390376.18 | backward: 390362.42 | backward_inner_microstep: 390344.83 | backward_inner: 390337.97 | backward_allreduce_microstep: 8.33 | backward_allreduce: 2.87 | reduce_tied_grads: 0.31 | comms: 18.21 | reduce_grads: 0.19 | step: 357.46 | _step_clipping: 0.13 | _step_step: 355.65 | _step_zero_grad: 0.50 | _step_check_overflow: 0.58 samples/sec: 16.292 | iteration 17080/ 143000 | elapsed time per iteration (ms): 62851.2 | learning rate: 5.825E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.387292E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 09:14:18,827] [INFO] [logging.py:60:log_dist] [Rank 0] step=17090, skipped=16, lr=[0.0005824557250699283, 0.0005824557250699283], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17090 loss: 2.3952 iter time (s): 63.562 samples/sec: 16.110 %comms: 0.0028744757674209636 %optimizer_step 0.05640672474935436 %forward: 22.930479848757194 %backward: 61.443758250872236 [2025-04-06 09:14:18,827] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27152.35 | forward: 145750.62 | backward_microstep: 390564.93 | backward: 390548.56 | backward_inner_microstep: 390528.61 | backward_inner: 390521.42 | backward_allreduce_microstep: 8.66 | backward_allreduce: 2.98 | reduce_tied_grads: 0.32 | comms: 18.27 | reduce_grads: 0.21 | step: 358.53 | _step_clipping: 0.15 | _step_step: 356.72 | _step_zero_grad: 0.53 | _step_check_overflow: 0.52 samples/sec: 16.110 | iteration 17090/ 143000 | elapsed time per iteration (ms): 63562.5 | learning rate: 5.825E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.383145E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 09:24:49,846] [INFO] [logging.py:60:log_dist] [Rank 0] step=17100, skipped=16, lr=[0.000582433510071883, 0.000582433510071883], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17100 loss: 2.3994 iter time (s): 63.101 samples/sec: 16.228 %comms: 0.0028729416596909346 %optimizer_step 0.05678201790718135 %forward: 23.057994785283494 %backward: 61.849131778890595 [2025-04-06 09:24:49,846] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23128.72 | forward: 145498.93 | backward_microstep: 390287.78 | backward: 390276.02 | backward_inner_microstep: 390258.51 | backward_inner: 390250.02 | backward_allreduce_microstep: 8.37 | backward_allreduce: 2.90 | reduce_tied_grads: 0.34 | comms: 18.13 | reduce_grads: 0.23 | step: 358.30 | _step_clipping: 0.13 | _step_step: 356.37 | _step_zero_grad: 0.56 | _step_check_overflow: 0.61 samples/sec: 16.228 | iteration 17100/ 143000 | elapsed time per iteration (ms): 63101.9 | learning rate: 5.824E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.395113E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 09:35:30,916] [INFO] [logging.py:60:log_dist] [Rank 0] step=17110, skipped=16, lr=[0.00058241128144233, 0.00058241128144233], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17110 loss: 2.3972 iter time (s): 64.106 samples/sec: 15.973 %comms: 0.002832618350222942 %optimizer_step 0.05813279353206997 %forward: 22.73745248961826 %backward: 60.88154973241694 [2025-04-06 09:35:30,917] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32870.00 | forward: 145761.75 | backward_microstep: 390301.23 | backward: 390290.05 | backward_inner_microstep: 390272.24 | backward_inner: 390265.42 | backward_allreduce_microstep: 8.56 | backward_allreduce: 2.96 | reduce_tied_grads: 0.37 | comms: 18.16 | reduce_grads: 0.22 | step: 372.67 | _step_clipping: 0.14 | _step_step: 370.63 | _step_zero_grad: 0.51 | _step_check_overflow: 0.76 samples/sec: 15.973 | iteration 17110/ 143000 | elapsed time per iteration (ms): 64107.1 | learning rate: 5.824E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.391575E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 09:46:06,618] [INFO] [logging.py:60:log_dist] [Rank 0] step=17120, skipped=16, lr=[0.000582389039182342, 0.000582389039182342], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17120 loss: 2.3786 iter time (s): 63.570 samples/sec: 16.108 %comms: 0.0028943870249039908 %optimizer_step 0.058926371364842356 %forward: 22.931155112664005 %backward: 61.40538930453429 [2025-04-06 09:46:06,619] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27424.55 | forward: 145772.23 | backward_microstep: 390366.27 | backward: 390351.06 | backward_inner_microstep: 390332.60 | backward_inner: 390325.68 | backward_allreduce_microstep: 8.78 | backward_allreduce: 3.00 | reduce_tied_grads: 0.39 | comms: 18.40 | reduce_grads: 0.26 | step: 374.59 | _step_clipping: 0.14 | _step_step: 372.55 | _step_zero_grad: 0.60 | _step_check_overflow: 0.64 samples/sec: 16.108 | iteration 17120/ 143000 | elapsed time per iteration (ms): 63570.2 | learning rate: 5.824E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.383749E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 09:56:38,677] [INFO] [logging.py:60:log_dist] [Rank 0] step=17130, skipped=16, lr=[0.0005823667832929928, 0.0005823667832929928], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17130 loss: 2.4059 iter time (s): 63.205 samples/sec: 16.201 %comms: 0.0029611976574374305 %optimizer_step 0.059350508117293393 %forward: 23.04993540615545 %backward: 61.81143549292538 [2025-04-06 09:56:38,678] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23471.28 | forward: 145687.78 | backward_microstep: 390699.89 | backward: 390680.96 | backward_inner_microstep: 390661.25 | backward_inner: 390653.77 | backward_allreduce_microstep: 8.33 | backward_allreduce: 2.88 | reduce_tied_grads: 0.35 | comms: 18.72 | reduce_grads: 0.22 | step: 375.13 | _step_clipping: 0.14 | _step_step: 373.07 | _step_zero_grad: 0.55 | _step_check_overflow: 0.69 samples/sec: 16.201 | iteration 17130/ 143000 | elapsed time per iteration (ms): 63205.9 | learning rate: 5.824E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.391786E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 10:07:11,806] [INFO] [logging.py:60:log_dist] [Rank 0] step=17140, skipped=16, lr=[0.0005823445137753566, 0.0005823445137753566], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17140 loss: 2.3808 iter time (s): 63.312 samples/sec: 16.174 %comms: 0.002845028142543901 %optimizer_step 0.0566122942813059 %forward: 22.984109800447506 %backward: 61.656465536771364 [2025-04-06 10:07:11,806] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25079.49 | forward: 145517.65 | backward_microstep: 390372.51 | backward: 390361.18 | backward_inner_microstep: 390344.08 | backward_inner: 390337.57 | backward_allreduce_microstep: 8.18 | backward_allreduce: 2.85 | reduce_tied_grads: 0.30 | comms: 18.01 | reduce_grads: 0.22 | step: 358.43 | _step_clipping: 0.14 | _step_step: 356.62 | _step_zero_grad: 0.52 | _step_check_overflow: 0.54 samples/sec: 16.174 | iteration 17140/ 143000 | elapsed time per iteration (ms): 63312.9 | learning rate: 5.823E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.378264E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 10:17:47,146] [INFO] [logging.py:60:log_dist] [Rank 0] step=17150, skipped=16, lr=[0.000582322230630508, 0.000582322230630508], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17150 loss: 2.3861 iter time (s): 63.533 samples/sec: 16.118 %comms: 0.002896441972636 %optimizer_step 0.05647575872739916 %forward: 22.931133949406814 %backward: 61.459985855292445 [2025-04-06 10:17:47,147] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26972.87 | forward: 145687.55 | backward_microstep: 390486.24 | backward: 390471.53 | backward_inner_microstep: 390453.55 | backward_inner: 390446.43 | backward_allreduce_microstep: 8.47 | backward_allreduce: 2.93 | reduce_tied_grads: 0.33 | comms: 18.40 | reduce_grads: 0.22 | step: 358.81 | _step_clipping: 0.12 | _step_step: 356.98 | _step_zero_grad: 0.50 | _step_check_overflow: 0.55 samples/sec: 16.117 | iteration 17150/ 143000 | elapsed time per iteration (ms): 63534.0 | learning rate: 5.823E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.381507E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 10:28:15,585] [INFO] [logging.py:60:log_dist] [Rank 0] step=17160, skipped=16, lr=[0.0005822999338595226, 0.0005822999338595226], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17160 loss: 2.3849 iter time (s): 62.843 samples/sec: 16.294 %comms: 0.0028853029804315924 %optimizer_step 0.05576952401018408 %forward: 23.187989352181862 %backward: 62.1427620257511 [2025-04-06 10:28:15,586] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20012.89 | forward: 145721.09 | backward_microstep: 390538.78 | backward: 390525.92 | backward_inner_microstep: 390508.27 | backward_inner: 390501.50 | backward_allreduce_microstep: 8.42 | backward_allreduce: 2.94 | reduce_tied_grads: 0.32 | comms: 18.13 | reduce_grads: 0.21 | step: 350.47 | _step_clipping: 0.13 | _step_step: 348.56 | _step_zero_grad: 0.47 | _step_check_overflow: 0.70 samples/sec: 16.294 | iteration 17160/ 143000 | elapsed time per iteration (ms): 62843.9 | learning rate: 5.823E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.392570E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 10:38:49,050] [INFO] [logging.py:60:log_dist] [Rank 0] step=17170, skipped=16, lr=[0.0005822776234634765, 0.0005822776234634765], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17170 loss: 2.3881 iter time (s): 63.346 samples/sec: 16.165 %comms: 0.0028565428676470043 %optimizer_step 0.057204852863609865 %forward: 23.013674344401835 %backward: 61.61594806919485 [2025-04-06 10:38:49,051] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25215.17 | forward: 145782.10 | backward_microstep: 390323.60 | backward: 390311.52 | backward_inner_microstep: 390293.46 | backward_inner: 390286.50 | backward_allreduce_microstep: 8.64 | backward_allreduce: 2.95 | reduce_tied_grads: 0.36 | comms: 18.10 | reduce_grads: 0.23 | step: 362.37 | _step_clipping: 0.14 | _step_step: 360.39 | _step_zero_grad: 0.60 | _step_check_overflow: 0.60 samples/sec: 16.165 | iteration 17170/ 143000 | elapsed time per iteration (ms): 63346.5 | learning rate: 5.823E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.393340E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 10:49:32,844] [INFO] [logging.py:60:log_dist] [Rank 0] step=17180, skipped=16, lr=[0.0005822552994434464, 0.0005822552994434464], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17180 loss: 2.3942 iter time (s): 64.379 samples/sec: 15.906 %comms: 0.0028140817012743628 %optimizer_step 0.055833340778038004 %forward: 22.633968947497422 %backward: 60.65751329478064 [2025-04-06 10:49:32,844] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 35378.93 | forward: 145714.71 | backward_microstep: 390520.29 | backward: 390505.63 | backward_inner_microstep: 390487.59 | backward_inner: 390480.59 | backward_allreduce_microstep: 8.49 | backward_allreduce: 2.92 | reduce_tied_grads: 0.32 | comms: 18.12 | reduce_grads: 0.20 | step: 359.45 | _step_clipping: 0.12 | _step_step: 357.62 | _step_zero_grad: 0.50 | _step_check_overflow: 0.59 samples/sec: 15.906 | iteration 17180/ 143000 | elapsed time per iteration (ms): 64379.3 | learning rate: 5.823E-04 | approx flops per GPU: 68.6TFLOPS | lm_loss: 2.386665E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 11:00:08,636] [INFO] [logging.py:60:log_dist] [Rank 0] step=17190, skipped=16, lr=[0.00058223296180051, 0.00058223296180051], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17190 loss: 2.4010 iter time (s): 63.579 samples/sec: 16.106 %comms: 0.0028417375538999485 %optimizer_step 0.05682871360444158 %forward: 22.8883723046514 %backward: 61.383790041845565 [2025-04-06 11:00:08,636] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27836.07 | forward: 145520.99 | backward_microstep: 390280.89 | backward: 390269.34 | backward_inner_microstep: 390252.43 | backward_inner: 390245.73 | backward_allreduce_microstep: 7.98 | backward_allreduce: 2.76 | reduce_tied_grads: 0.32 | comms: 18.07 | reduce_grads: 0.21 | step: 361.31 | _step_clipping: 0.13 | _step_step: 359.40 | _step_zero_grad: 0.54 | _step_check_overflow: 0.62 samples/sec: 16.106 | iteration 17190/ 143000 | elapsed time per iteration (ms): 63579.2 | learning rate: 5.822E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.384385E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 11:10:46,582] [INFO] [logging.py:60:log_dist] [Rank 0] step=17200, skipped=16, lr=[0.0005822106105357454, 0.0005822106105357454], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17200 loss: 2.3913 iter time (s): 63.794 samples/sec: 16.052 %comms: 0.0028380439679542356 %optimizer_step 0.05813553938738797 %forward: 22.847037657457445 %backward: 61.19865558742558 [2025-04-06 11:10:46,583] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29587.44 | forward: 145750.49 | backward_microstep: 390421.95 | backward: 390410.97 | backward_inner_microstep: 390392.76 | backward_inner: 390385.79 | backward_allreduce_microstep: 8.79 | backward_allreduce: 3.07 | reduce_tied_grads: 0.36 | comms: 18.11 | reduce_grads: 0.23 | step: 370.87 | _step_clipping: 0.15 | _step_step: 368.98 | _step_zero_grad: 0.55 | _step_check_overflow: 0.55 samples/sec: 16.051 | iteration 17200/ 143000 | elapsed time per iteration (ms): 63794.7 | learning rate: 5.822E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.388597E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 11:21:17,147] [INFO] [logging.py:60:log_dist] [Rank 0] step=17210, skipped=16, lr=[0.0005821882456502311, 0.0005821882456502311], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17210 loss: 2.3841 iter time (s): 63.056 samples/sec: 16.240 %comms: 0.002905070370234729 %optimizer_step 0.05810828894893895 %forward: 23.09572831010862 %backward: 61.95352765296055 [2025-04-06 11:21:17,148] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22057.30 | forward: 145632.14 | backward_microstep: 390665.67 | backward: 390653.41 | backward_inner_microstep: 390633.73 | backward_inner: 390625.21 | backward_allreduce_microstep: 10.04 | backward_allreduce: 4.71 | reduce_tied_grads: 0.35 | comms: 18.32 | reduce_grads: 0.20 | step: 366.41 | _step_clipping: 0.14 | _step_step: 364.29 | _step_zero_grad: 0.55 | _step_check_overflow: 0.80 samples/sec: 16.239 | iteration 17210/ 143000 | elapsed time per iteration (ms): 63056.5 | learning rate: 5.822E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.385485E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 11:31:50,542] [INFO] [logging.py:60:log_dist] [Rank 0] step=17220, skipped=16, lr=[0.0005821658671450468, 0.0005821658671450468], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17220 loss: 2.3751 iter time (s): 63.339 samples/sec: 16.167 %comms: 0.0028669823148510503 %optimizer_step 0.05669510696560638 %forward: 22.97460890999819 %backward: 61.644827883877696 [2025-04-06 11:31:50,543] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25242.34 | forward: 145518.65 | backward_microstep: 390461.63 | backward: 390451.57 | backward_inner_microstep: 390434.28 | backward_inner: 390427.82 | backward_allreduce_microstep: 8.40 | backward_allreduce: 2.85 | reduce_tied_grads: 0.33 | comms: 18.16 | reduce_grads: 0.21 | step: 359.10 | _step_clipping: 0.14 | _step_step: 357.26 | _step_zero_grad: 0.51 | _step_check_overflow: 0.61 samples/sec: 16.167 | iteration 17220/ 143000 | elapsed time per iteration (ms): 63339.5 | learning rate: 5.822E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.377949E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 11:42:27,886] [INFO] [logging.py:60:log_dist] [Rank 0] step=17230, skipped=16, lr=[0.0005821434750212723, 0.0005821434750212723], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17230 loss: 2.4138 iter time (s): 63.734 samples/sec: 16.067 %comms: 0.0028613024670003004 %optimizer_step 0.05892170249114331 %forward: 22.879920063750184 %backward: 61.27546815303674 [2025-04-06 11:42:27,887] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28787.11 | forward: 145822.36 | backward_microstep: 390545.26 | backward: 390531.68 | backward_inner_microstep: 390514.64 | backward_inner: 390508.03 | backward_allreduce_microstep: 8.10 | backward_allreduce: 2.78 | reduce_tied_grads: 0.34 | comms: 18.24 | reduce_grads: 0.23 | step: 375.53 | _step_clipping: 0.13 | _step_step: 373.70 | _step_zero_grad: 0.55 | _step_check_overflow: 0.50 samples/sec: 16.067 | iteration 17230/ 143000 | elapsed time per iteration (ms): 63734.4 | learning rate: 5.821E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.402073E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 11:53:03,354] [INFO] [logging.py:60:log_dist] [Rank 0] step=17240, skipped=16, lr=[0.0005821210692799888, 0.0005821210692799888], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17240 loss: 2.3735 iter time (s): 63.546 samples/sec: 16.114 %comms: 0.0028972137513362498 %optimizer_step 0.05863770036410128 %forward: 22.914934486834714 %backward: 61.45483136795124 [2025-04-06 11:53:03,354] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27125.55 | forward: 145615.62 | backward_microstep: 390535.14 | backward: 390521.89 | backward_inner_microstep: 390504.31 | backward_inner: 390497.48 | backward_allreduce_microstep: 8.47 | backward_allreduce: 3.16 | reduce_tied_grads: 0.35 | comms: 18.41 | reduce_grads: 0.25 | step: 372.62 | _step_clipping: 0.14 | _step_step: 370.67 | _step_zero_grad: 0.57 | _step_check_overflow: 0.56 samples/sec: 16.114 | iteration 17240/ 143000 | elapsed time per iteration (ms): 63546.8 | learning rate: 5.821E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.400483E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 12:03:39,971] [INFO] [logging.py:60:log_dist] [Rank 0] step=17250, skipped=16, lr=[0.0005820986499222772, 0.0005820986499222772], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17250 loss: 2.3577 iter time (s): 63.661 samples/sec: 16.085 %comms: 0.002858617333636013 %optimizer_step 0.05673109591056041 %forward: 22.900897703028786 %backward: 61.3605172310724 [2025-04-06 12:03:39,971] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27959.64 | forward: 145789.47 | backward_microstep: 390641.57 | backward: 390627.36 | backward_inner_microstep: 390609.28 | backward_inner: 390602.31 | backward_allreduce_microstep: 8.55 | backward_allreduce: 2.96 | reduce_tied_grads: 0.32 | comms: 18.20 | reduce_grads: 0.22 | step: 361.16 | _step_clipping: 0.15 | _step_step: 359.34 | _step_zero_grad: 0.53 | _step_check_overflow: 0.51 samples/sec: 16.085 | iteration 17250/ 143000 | elapsed time per iteration (ms): 63661.7 | learning rate: 5.821E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.380448E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 12:14:16,803] [INFO] [logging.py:60:log_dist] [Rank 0] step=17260, skipped=16, lr=[0.0005820762169492198, 0.0005820762169492198], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17260 loss: 2.3912 iter time (s): 63.683 samples/sec: 16.080 %comms: 0.0028273201149418333 %optimizer_step 0.056508177562845 %forward: 22.889539068061833 %backward: 61.31905316948573 [2025-04-06 12:14:16,804] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28381.07 | forward: 145766.72 | backward_microstep: 390509.73 | backward: 390496.17 | backward_inner_microstep: 390478.35 | backward_inner: 390471.58 | backward_allreduce_microstep: 8.55 | backward_allreduce: 2.87 | reduce_tied_grads: 0.32 | comms: 18.01 | reduce_grads: 0.19 | step: 359.86 | _step_clipping: 0.13 | _step_step: 358.04 | _step_zero_grad: 0.55 | _step_check_overflow: 0.55 samples/sec: 16.080 | iteration 17260/ 143000 | elapsed time per iteration (ms): 63683.3 | learning rate: 5.821E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.395360E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 12:24:59,813] [INFO] [logging.py:60:log_dist] [Rank 0] step=17270, skipped=16, lr=[0.0005820537703618994, 0.0005820537703618994], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17270 loss: 2.3600 iter time (s): 64.300 samples/sec: 15.925 %comms: 0.002834347149805149 %optimizer_step 0.05721001484553327 %forward: 22.700001594021323 %backward: 60.7538172073864 [2025-04-06 12:24:59,813] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 34165.36 | forward: 145961.83 | backward_microstep: 390664.61 | backward: 390649.24 | backward_inner_microstep: 390631.33 | backward_inner: 390624.52 | backward_allreduce_microstep: 8.53 | backward_allreduce: 2.95 | reduce_tied_grads: 0.33 | comms: 18.22 | reduce_grads: 0.22 | step: 367.86 | _step_clipping: 0.14 | _step_step: 366.07 | _step_zero_grad: 0.54 | _step_check_overflow: 0.50 samples/sec: 15.925 | iteration 17270/ 143000 | elapsed time per iteration (ms): 64300.9 | learning rate: 5.821E-04 | approx flops per GPU: 68.7TFLOPS | lm_loss: 2.388723E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 12:35:29,949] [INFO] [logging.py:60:log_dist] [Rank 0] step=17280, skipped=16, lr=[0.0005820313101613993, 0.0005820313101613993], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17280 loss: 2.3787 iter time (s): 63.013 samples/sec: 16.251 %comms: 0.0028976606508748478 %optimizer_step 0.05674305452149804 %forward: 23.109877758751413 %backward: 61.99452804497441 [2025-04-06 12:35:29,950] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21631.21 | forward: 145622.43 | backward_microstep: 390661.83 | backward: 390646.53 | backward_inner_microstep: 390629.41 | backward_inner: 390622.60 | backward_allreduce_microstep: 7.91 | backward_allreduce: 2.72 | reduce_tied_grads: 0.33 | comms: 18.26 | reduce_grads: 0.20 | step: 357.56 | _step_clipping: 0.14 | _step_step: 355.51 | _step_zero_grad: 0.58 | _step_check_overflow: 0.70 samples/sec: 16.250 | iteration 17280/ 143000 | elapsed time per iteration (ms): 63013.7 | learning rate: 5.820E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.381054E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 12:46:11,285] [INFO] [logging.py:60:log_dist] [Rank 0] step=17290, skipped=16, lr=[0.0005820088363488034, 0.0005820088363488034], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17290 loss: 2.3845 iter time (s): 64.133 samples/sec: 15.967 %comms: 0.0028091806929209226 %optimizer_step 0.05538987579218998 %forward: 22.742600556529087 %backward: 60.88480977487488 [2025-04-06 12:46:11,285] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32817.72 | forward: 145854.97 | backward_microstep: 390486.53 | backward: 390472.15 | backward_inner_microstep: 390454.52 | backward_inner: 390447.68 | backward_allreduce_microstep: 8.17 | backward_allreduce: 2.87 | reduce_tied_grads: 0.31 | comms: 18.02 | reduce_grads: 0.19 | step: 355.23 | _step_clipping: 0.14 | _step_step: 353.54 | _step_zero_grad: 0.48 | _step_check_overflow: 0.49 samples/sec: 15.967 | iteration 17290/ 143000 | elapsed time per iteration (ms): 64133.5 | learning rate: 5.820E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.384544E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 12:56:48,490] [INFO] [logging.py:60:log_dist] [Rank 0] step=17300, skipped=16, lr=[0.0005819863489251966, 0.0005819863489251966], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17300 loss: 2.3980 iter time (s): 63.720 samples/sec: 16.070 %comms: 0.0028423905805234073 %optimizer_step 0.06149570353465288 %forward: 22.862005747075333 %backward: 61.27521995660244 [2025-04-06 12:56:48,491] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28867.40 | forward: 145676.64 | backward_microstep: 390458.22 | backward: 390445.55 | backward_inner_microstep: 390427.81 | backward_inner: 390421.00 | backward_allreduce_microstep: 8.41 | backward_allreduce: 2.88 | reduce_tied_grads: 0.33 | comms: 18.11 | reduce_grads: 0.22 | step: 391.85 | _step_clipping: 0.13 | _step_step: 389.82 | _step_zero_grad: 0.48 | _step_check_overflow: 0.59 samples/sec: 16.070 | iteration 17300/ 143000 | elapsed time per iteration (ms): 63720.5 | learning rate: 5.820E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.379431E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 13:07:23,760] [INFO] [logging.py:60:log_dist] [Rank 0] step=17310, skipped=16, lr=[0.000581963847891664, 0.000581963847891664], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17310 loss: 2.3741 iter time (s): 63.526 samples/sec: 16.119 %comms: 0.0028266931251078016 %optimizer_step 0.055944706950581465 %forward: 22.895157301208936 %backward: 61.41955895226179 [2025-04-06 13:07:23,761] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27483.01 | forward: 145444.76 | backward_microstep: 390186.32 | backward: 390176.53 | backward_inner_microstep: 390158.75 | backward_inner: 390151.97 | backward_allreduce_microstep: 8.59 | backward_allreduce: 2.96 | reduce_tied_grads: 0.31 | comms: 17.96 | reduce_grads: 0.20 | step: 355.40 | _step_clipping: 0.12 | _step_step: 353.64 | _step_zero_grad: 0.49 | _step_check_overflow: 0.58 samples/sec: 16.119 | iteration 17310/ 143000 | elapsed time per iteration (ms): 63527.0 | learning rate: 5.820E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.385507E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 13:18:04,519] [INFO] [logging.py:60:log_dist] [Rank 0] step=17320, skipped=16, lr=[0.0005819413332492919, 0.0005819413332492919], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17320 loss: 2.3845 iter time (s): 64.075 samples/sec: 15.981 %comms: 0.002890628596378846 %optimizer_step 0.05667680039505024 %forward: 22.71589167167401 %backward: 60.8909010208032 [2025-04-06 13:18:04,519] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32878.49 | forward: 145552.73 | backward_microstep: 390172.63 | backward: 390160.20 | backward_inner_microstep: 390140.14 | backward_inner: 390133.33 | backward_allreduce_microstep: 10.66 | backward_allreduce: 3.21 | reduce_tied_grads: 0.33 | comms: 18.52 | reduce_grads: 0.22 | step: 363.16 | _step_clipping: 0.15 | _step_step: 361.20 | _step_zero_grad: 0.55 | _step_check_overflow: 0.63 samples/sec: 15.981 | iteration 17320/ 143000 | elapsed time per iteration (ms): 64075.9 | learning rate: 5.819E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.382821E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 13:28:42,575] [INFO] [logging.py:60:log_dist] [Rank 0] step=17330, skipped=16, lr=[0.0005819188049991666, 0.0005819188049991666], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17330 loss: 2.3887 iter time (s): 63.805 samples/sec: 16.049 %comms: 0.0028294123971319986 %optimizer_step 0.05564238270216714 %forward: 22.846212851572396 %backward: 61.21018340885646 [2025-04-06 13:28:42,575] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29506.21 | forward: 145770.17 | backward_microstep: 390569.86 | backward: 390551.34 | backward_inner_microstep: 390533.17 | backward_inner: 390526.06 | backward_allreduce_microstep: 8.48 | backward_allreduce: 2.91 | reduce_tied_grads: 0.33 | comms: 18.05 | reduce_grads: 0.21 | step: 355.03 | _step_clipping: 0.13 | _step_step: 353.28 | _step_zero_grad: 0.47 | _step_check_overflow: 0.56 samples/sec: 16.049 | iteration 17330/ 143000 | elapsed time per iteration (ms): 63805.6 | learning rate: 5.819E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.387899E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 13:39:18,642] [INFO] [logging.py:60:log_dist] [Rank 0] step=17340, skipped=16, lr=[0.0005818962631423757, 0.0005818962631423757], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17340 loss: 2.3962 iter time (s): 63.606 samples/sec: 16.099 %comms: 0.0029129635201380335 %optimizer_step 0.05571327139129109 %forward: 22.899563513109335 %backward: 61.4123106132483 [2025-04-06 13:39:18,643] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27572.41 | forward: 145655.18 | backward_microstep: 390637.88 | backward: 390619.72 | backward_inner_microstep: 390602.08 | backward_inner: 390594.91 | backward_allreduce_microstep: 8.15 | backward_allreduce: 2.81 | reduce_tied_grads: 0.34 | comms: 18.53 | reduce_grads: 0.22 | step: 354.37 | _step_clipping: 0.14 | _step_step: 352.34 | _step_zero_grad: 0.57 | _step_check_overflow: 0.65 samples/sec: 16.099 | iteration 17340/ 143000 | elapsed time per iteration (ms): 63606.7 | learning rate: 5.819E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.380777E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 13:50:01,299] [INFO] [logging.py:60:log_dist] [Rank 0] step=17350, skipped=16, lr=[0.000581873707680007, 0.000581873707680007], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17350 loss: 2.3748 iter time (s): 64.265 samples/sec: 15.934 %comms: 0.0028140158312855612 %optimizer_step 0.055008962147559 %forward: 22.662205392328428 %backward: 60.753435582274776 [2025-04-06 13:50:01,299] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 34406.66 | forward: 145638.78 | backward_microstep: 390446.07 | backward: 390432.28 | backward_inner_microstep: 390414.69 | backward_inner: 390407.73 | backward_allreduce_microstep: 8.32 | backward_allreduce: 2.88 | reduce_tied_grads: 0.32 | comms: 18.08 | reduce_grads: 0.21 | step: 353.52 | _step_clipping: 0.12 | _step_step: 351.75 | _step_zero_grad: 0.51 | _step_check_overflow: 0.53 samples/sec: 15.934 | iteration 17350/ 143000 | elapsed time per iteration (ms): 64265.6 | learning rate: 5.819E-04 | approx flops per GPU: 68.7TFLOPS | lm_loss: 2.381064E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 14:00:28,200] [INFO] [logging.py:60:log_dist] [Rank 0] step=17360, skipped=16, lr=[0.0005818511386131494, 0.0005818511386131494], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17360 loss: 2.3588 iter time (s): 62.690 samples/sec: 16.334 %comms: 0.002898997634658138 %optimizer_step 0.056138538117101074 %forward: 23.20391300103002 %backward: 62.27637219300607 [2025-04-06 14:00:28,201] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18836.73 | forward: 145464.36 | backward_microstep: 390421.64 | backward: 390407.97 | backward_inner_microstep: 390390.77 | backward_inner: 390383.69 | backward_allreduce_microstep: 8.11 | backward_allreduce: 2.81 | reduce_tied_grads: 0.31 | comms: 18.17 | reduce_grads: 0.20 | step: 351.93 | _step_clipping: 0.13 | _step_step: 350.10 | _step_zero_grad: 0.55 | _step_check_overflow: 0.53 samples/sec: 16.334 | iteration 17360/ 143000 | elapsed time per iteration (ms): 62690.1 | learning rate: 5.819E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.382116E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 14:11:18,668] [INFO] [logging.py:60:log_dist] [Rank 0] step=17370, skipped=16, lr=[0.0005818285559428918, 0.0005818285559428918], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17370 loss: 2.3835 iter time (s): 65.046 samples/sec: 15.743 %comms: 0.002796536263463998 %optimizer_step 0.05641235584626849 %forward: 22.433557219135995 %backward: 60.034011571095434 [2025-04-06 14:11:18,668] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 41814.62 | forward: 145921.59 | backward_microstep: 390515.24 | backward: 390497.96 | backward_inner_microstep: 390479.61 | backward_inner: 390472.56 | backward_allreduce_microstep: 8.71 | backward_allreduce: 3.03 | reduce_tied_grads: 0.36 | comms: 18.19 | reduce_grads: 0.23 | step: 366.94 | _step_clipping: 0.14 | _step_step: 365.09 | _step_zero_grad: 0.54 | _step_check_overflow: 0.54 samples/sec: 15.743 | iteration 17370/ 143000 | elapsed time per iteration (ms): 65046.7 | learning rate: 5.818E-04 | approx flops per GPU: 67.9TFLOPS | lm_loss: 2.377151E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 14:21:53,138] [INFO] [logging.py:60:log_dist] [Rank 0] step=17380, skipped=16, lr=[0.0005818059596703246, 0.0005818059596703246], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17380 loss: 2.3785 iter time (s): 63.446 samples/sec: 16.140 %comms: 0.0028734689950553494 %optimizer_step 0.05831631052752829 %forward: 22.930210130357654 %backward: 61.54958678973226 [2025-04-06 14:21:53,139] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26221.18 | forward: 145484.15 | backward_microstep: 390523.89 | backward: 390510.55 | backward_inner_microstep: 390493.28 | backward_inner: 390486.50 | backward_allreduce_microstep: 7.98 | backward_allreduce: 2.77 | reduce_tied_grads: 0.34 | comms: 18.23 | reduce_grads: 0.20 | step: 370.00 | _step_clipping: 0.14 | _step_step: 368.10 | _step_zero_grad: 0.52 | _step_check_overflow: 0.61 samples/sec: 16.139 | iteration 17380/ 143000 | elapsed time per iteration (ms): 63447.1 | learning rate: 5.818E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.384531E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 14:32:29,003] [INFO] [logging.py:60:log_dist] [Rank 0] step=17390, skipped=16, lr=[0.0005817833497965379, 0.0005817833497965379], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17390 loss: 2.4156 iter time (s): 63.586 samples/sec: 16.104 %comms: 0.002861696953769956 %optimizer_step 0.05842071034783754 %forward: 22.887257669506443 %backward: 61.387808083756326 [2025-04-06 14:32:29,004] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27810.01 | forward: 145530.58 | backward_microstep: 390352.45 | backward: 390339.62 | backward_inner_microstep: 390321.54 | backward_inner: 390314.62 | backward_allreduce_microstep: 8.42 | backward_allreduce: 2.93 | reduce_tied_grads: 0.32 | comms: 18.20 | reduce_grads: 0.21 | step: 371.47 | _step_clipping: 0.14 | _step_step: 369.50 | _step_zero_grad: 0.53 | _step_check_overflow: 0.71 samples/sec: 16.104 | iteration 17390/ 143000 | elapsed time per iteration (ms): 63586.4 | learning rate: 5.818E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.403764E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 14:43:04,431] [INFO] [logging.py:60:log_dist] [Rank 0] step=17400, skipped=16, lr=[0.0005817607263226232, 0.0005817607263226232], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17400 loss: 2.3814 iter time (s): 63.542 samples/sec: 16.115 %comms: 0.002863473053035742 %optimizer_step 0.05698156037493541 %forward: 22.891746821468182 %backward: 61.42151233381509 [2025-04-06 14:43:04,433] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27465.90 | forward: 145459.31 | backward_microstep: 390297.87 | backward: 390286.11 | backward_inner_microstep: 390268.51 | backward_inner: 390261.75 | backward_allreduce_microstep: 8.47 | backward_allreduce: 2.85 | reduce_tied_grads: 0.35 | comms: 18.20 | reduce_grads: 0.22 | step: 362.07 | _step_clipping: 0.15 | _step_step: 360.22 | _step_zero_grad: 0.50 | _step_check_overflow: 0.57 samples/sec: 16.115 | iteration 17400/ 143000 | elapsed time per iteration (ms): 63542.9 | learning rate: 5.818E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.385213E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 14:53:43,983] [INFO] [logging.py:60:log_dist] [Rank 0] step=17410, skipped=16, lr=[0.0005817380892496724, 0.0005817380892496724], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17410 loss: 2.3780 iter time (s): 63.955 samples/sec: 16.011 %comms: 0.0028801316231111904 %optimizer_step 0.05553060631937591 %forward: 22.80329147994116 %backward: 61.01632634398702 [2025-04-06 14:53:43,983] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31295.46 | forward: 145837.35 | backward_microstep: 390238.78 | backward: 390226.97 | backward_inner_microstep: 390209.37 | backward_inner: 390202.59 | backward_allreduce_microstep: 8.41 | backward_allreduce: 2.89 | reduce_tied_grads: 0.35 | comms: 18.42 | reduce_grads: 0.21 | step: 355.14 | _step_clipping: 0.13 | _step_step: 353.27 | _step_zero_grad: 0.52 | _step_check_overflow: 0.62 samples/sec: 16.011 | iteration 17410/ 143000 | elapsed time per iteration (ms): 63955.1 | learning rate: 5.817E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.386518E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 15:04:24,567] [INFO] [logging.py:60:log_dist] [Rank 0] step=17420, skipped=16, lr=[0.000581715438578778, 0.000581715438578778], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17420 loss: 2.3840 iter time (s): 64.058 samples/sec: 15.986 %comms: 0.002872436313324539 %optimizer_step 0.05672164733805713 %forward: 22.725754736633093 %backward: 60.93857880068493 [2025-04-06 15:04:24,568] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32437.57 | forward: 145576.16 | backward_microstep: 390372.53 | backward: 390359.07 | backward_inner_microstep: 390341.19 | backward_inner: 390334.24 | backward_allreduce_microstep: 8.53 | backward_allreduce: 2.96 | reduce_tied_grads: 0.34 | comms: 18.40 | reduce_grads: 0.20 | step: 363.35 | _step_clipping: 0.13 | _step_step: 361.32 | _step_zero_grad: 0.54 | _step_check_overflow: 0.74 samples/sec: 15.985 | iteration 17420/ 143000 | elapsed time per iteration (ms): 64058.4 | learning rate: 5.817E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.380166E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 15:15:04,721] [INFO] [logging.py:60:log_dist] [Rank 0] step=17430, skipped=16, lr=[0.0005816927743110334, 0.0005816927743110334], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17430 loss: 2.4085 iter time (s): 64.015 samples/sec: 15.996 %comms: 0.0028523146598269516 %optimizer_step 0.056579957775880756 %forward: 22.776947354853945 %backward: 61.03290055424901 [2025-04-06 15:15:04,722] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31402.43 | forward: 145806.28 | backward_microstep: 390714.75 | backward: 390701.17 | backward_inner_microstep: 390683.26 | backward_inner: 390676.33 | backward_allreduce_microstep: 8.45 | backward_allreduce: 2.91 | reduce_tied_grads: 0.37 | comms: 18.26 | reduce_grads: 0.20 | step: 362.20 | _step_clipping: 0.15 | _step_step: 360.20 | _step_zero_grad: 0.55 | _step_check_overflow: 0.69 samples/sec: 15.996 | iteration 17430/ 143000 | elapsed time per iteration (ms): 64015.4 | learning rate: 5.817E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.389232E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 15:25:35,107] [INFO] [logging.py:60:log_dist] [Rank 0] step=17440, skipped=16, lr=[0.0005816700964475324, 0.0005816700964475324], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17440 loss: 2.3846 iter time (s): 63.038 samples/sec: 16.244 %comms: 0.002906272921796603 %optimizer_step 0.05543117452700173 %forward: 23.073900005663113 %backward: 61.91624180365142 [2025-04-06 15:25:35,108] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22417.89 | forward: 145453.23 | backward_microstep: 390320.03 | backward: 390307.55 | backward_inner_microstep: 390290.29 | backward_inner: 390283.54 | backward_allreduce_microstep: 8.14 | backward_allreduce: 2.78 | reduce_tied_grads: 0.29 | comms: 18.32 | reduce_grads: 0.19 | step: 349.43 | _step_clipping: 0.12 | _step_step: 347.65 | _step_zero_grad: 0.52 | _step_check_overflow: 0.54 samples/sec: 16.244 | iteration 17440/ 143000 | elapsed time per iteration (ms): 63038.6 | learning rate: 5.817E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.383839E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 15:36:20,246] [INFO] [logging.py:60:log_dist] [Rank 0] step=17450, skipped=16, lr=[0.0005816474049893694, 0.0005816474049893694], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17450 loss: 2.3890 iter time (s): 64.513 samples/sec: 15.873 %comms: 0.0028268788970402248 %optimizer_step 0.05804045219206003 %forward: 22.60028256430707 %backward: 60.4985279061035 [2025-04-06 15:36:20,247] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 36816.97 | forward: 145801.76 | backward_microstep: 390309.34 | backward: 390295.65 | backward_inner_microstep: 390277.56 | backward_inner: 390270.63 | backward_allreduce_microstep: 8.78 | backward_allreduce: 2.93 | reduce_tied_grads: 0.36 | comms: 18.24 | reduce_grads: 0.22 | step: 374.44 | _step_clipping: 0.15 | _step_step: 372.45 | _step_zero_grad: 0.57 | _step_check_overflow: 0.61 samples/sec: 15.873 | iteration 17450/ 143000 | elapsed time per iteration (ms): 64513.9 | learning rate: 5.816E-04 | approx flops per GPU: 68.5TFLOPS | lm_loss: 2.385378E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 15:46:57,224] [INFO] [logging.py:60:log_dist] [Rank 0] step=17460, skipped=16, lr=[0.0005816246999376397, 0.0005816246999376397], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17460 loss: 2.3935 iter time (s): 63.697 samples/sec: 16.076 %comms: 0.0028344232715021497 %optimizer_step 0.056399243912745145 %forward: 22.87088666749479 %backward: 61.28414020553353 [2025-04-06 15:46:57,225] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28657.89 | forward: 145681.17 | backward_microstep: 390375.68 | backward: 390362.87 | backward_inner_microstep: 390345.26 | backward_inner: 390338.23 | backward_allreduce_microstep: 8.35 | backward_allreduce: 2.89 | reduce_tied_grads: 0.31 | comms: 18.05 | reduce_grads: 0.20 | step: 359.25 | _step_clipping: 0.13 | _step_step: 357.42 | _step_zero_grad: 0.51 | _step_check_overflow: 0.61 samples/sec: 16.076 | iteration 17460/ 143000 | elapsed time per iteration (ms): 63697.8 | learning rate: 5.816E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.374558E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 15:57:31,029] [INFO] [logging.py:60:log_dist] [Rank 0] step=17470, skipped=16, lr=[0.0005816019812934391, 0.0005816019812934391], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17470 loss: 2.3864 iter time (s): 63.380 samples/sec: 16.157 %comms: 0.002827845157565003 %optimizer_step 0.054898618397516336 %forward: 22.991559569097113 %backward: 61.5667488648334 [2025-04-06 15:57:31,030] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25681.67 | forward: 145720.47 | backward_microstep: 390221.83 | backward: 390209.96 | backward_inner_microstep: 390192.86 | backward_inner: 390186.25 | backward_allreduce_microstep: 8.23 | backward_allreduce: 2.82 | reduce_tied_grads: 0.28 | comms: 17.92 | reduce_grads: 0.19 | step: 347.95 | _step_clipping: 0.13 | _step_step: 346.16 | _step_zero_grad: 0.51 | _step_check_overflow: 0.55 samples/sec: 16.156 | iteration 17470/ 143000 | elapsed time per iteration (ms): 63380.5 | learning rate: 5.816E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.379557E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 16:08:01,983] [INFO] [logging.py:60:log_dist] [Rank 0] step=17480, skipped=16, lr=[0.0005815792490578642, 0.0005815792490578642], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17480 loss: 2.3824 iter time (s): 63.095 samples/sec: 16.230 %comms: 0.0028590325921409834 %optimizer_step 0.05610563805482032 %forward: 23.07508654235089 %backward: 61.87356908131899 [2025-04-06 16:08:01,983] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22754.84 | forward: 145591.63 | backward_microstep: 390402.94 | backward: 390389.60 | backward_inner_microstep: 390372.13 | backward_inner: 390365.39 | backward_allreduce_microstep: 8.38 | backward_allreduce: 3.05 | reduce_tied_grads: 0.33 | comms: 18.04 | reduce_grads: 0.21 | step: 354.00 | _step_clipping: 0.13 | _step_step: 352.14 | _step_zero_grad: 0.53 | _step_check_overflow: 0.53 samples/sec: 16.229 | iteration 17480/ 143000 | elapsed time per iteration (ms): 63095.3 | learning rate: 5.816E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.385357E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 16:18:33,653] [INFO] [logging.py:60:log_dist] [Rank 0] step=17490, skipped=16, lr=[0.0005815565032320122, 0.0005815565032320122], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17490 loss: 2.3971 iter time (s): 63.166 samples/sec: 16.211 %comms: 0.002936336467167979 %optimizer_step 0.056898648051318355 %forward: 23.043640728630425 %backward: 61.805607423106004 [2025-04-06 16:18:33,653] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23476.40 | forward: 145558.33 | backward_microstep: 390417.64 | backward: 390403.62 | backward_inner_microstep: 390386.07 | backward_inner: 390379.17 | backward_allreduce_microstep: 8.39 | backward_allreduce: 2.94 | reduce_tied_grads: 0.34 | comms: 18.55 | reduce_grads: 0.22 | step: 359.41 | _step_clipping: 0.13 | _step_step: 357.50 | _step_zero_grad: 0.52 | _step_check_overflow: 0.57 samples/sec: 16.211 | iteration 17490/ 143000 | elapsed time per iteration (ms): 63167.0 | learning rate: 5.816E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.387472E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 16:29:10,072] [INFO] [logging.py:60:log_dist] [Rank 0] step=17500, skipped=16, lr=[0.0005815337438169806, 0.0005815337438169806], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17500 loss: 2.3706 iter time (s): 63.641 samples/sec: 16.090 %comms: 0.002861110133966417 %optimizer_step 0.056338042774235625 %forward: 22.91770417514307 %backward: 61.36019215244431 [2025-04-06 16:29:10,073] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27850.60 | forward: 145851.46 | backward_microstep: 390519.05 | backward: 390504.82 | backward_inner_microstep: 390485.22 | backward_inner: 390478.31 | backward_allreduce_microstep: 10.27 | backward_allreduce: 4.61 | reduce_tied_grads: 0.31 | comms: 18.21 | reduce_grads: 0.24 | step: 358.54 | _step_clipping: 0.16 | _step_step: 356.71 | _step_zero_grad: 0.50 | _step_check_overflow: 0.57 samples/sec: 16.090 | iteration 17500/ 143000 | elapsed time per iteration (ms): 63642.0 | learning rate: 5.815E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.380653E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 16:39:47,390] [INFO] [logging.py:60:log_dist] [Rank 0] step=17510, skipped=16, lr=[0.0005815109708138682, 0.0005815109708138682], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17510 loss: 2.3896 iter time (s): 63.731 samples/sec: 16.067 %comms: 0.0028405072868356936 %optimizer_step 0.056714712712589786 %forward: 22.8448426845462 %backward: 61.25101549426151 [2025-04-06 16:39:47,391] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29152.16 | forward: 145592.85 | backward_microstep: 390372.12 | backward: 390359.87 | backward_inner_microstep: 390342.52 | backward_inner: 390335.72 | backward_allreduce_microstep: 8.22 | backward_allreduce: 2.82 | reduce_tied_grads: 0.32 | comms: 18.10 | reduce_grads: 0.22 | step: 361.45 | _step_clipping: 0.14 | _step_step: 359.55 | _step_zero_grad: 0.54 | _step_check_overflow: 0.58 samples/sec: 16.067 | iteration 17510/ 143000 | elapsed time per iteration (ms): 63731.8 | learning rate: 5.815E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.378022E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 16:50:24,203] [INFO] [logging.py:60:log_dist] [Rank 0] step=17520, skipped=16, lr=[0.000581488184223774, 0.000581488184223774], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17520 loss: 2.3670 iter time (s): 63.681 samples/sec: 16.080 %comms: 0.00286304885888405 %optimizer_step 0.05613071103756447 %forward: 22.845378541100576 %backward: 61.31186394054411 [2025-04-06 16:50:24,204] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28665.65 | forward: 145481.06 | backward_microstep: 390449.57 | backward: 390438.49 | backward_inner_microstep: 390419.19 | backward_inner: 390412.37 | backward_allreduce_microstep: 8.31 | backward_allreduce: 2.85 | reduce_tied_grads: 0.31 | comms: 18.23 | reduce_grads: 0.20 | step: 357.44 | _step_clipping: 0.12 | _step_step: 355.64 | _step_zero_grad: 0.50 | _step_check_overflow: 0.60 samples/sec: 16.080 | iteration 17520/ 143000 | elapsed time per iteration (ms): 63681.3 | learning rate: 5.815E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.378049E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 17:00:58,358] [INFO] [logging.py:60:log_dist] [Rank 0] step=17530, skipped=16, lr=[0.0005814653840477979, 0.0005814653840477979], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17530 loss: 2.3944 iter time (s): 63.415 samples/sec: 16.148 %comms: 0.002838883715747564 %optimizer_step 0.05489310578490105 %forward: 22.96587913671073 %backward: 61.56443146068822 [2025-04-06 17:00:58,358] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25900.92 | forward: 145637.86 | backward_microstep: 390420.13 | backward: 390410.14 | backward_inner_microstep: 390392.90 | backward_inner: 390384.38 | backward_allreduce_microstep: 8.33 | backward_allreduce: 2.86 | reduce_tied_grads: 0.28 | comms: 18.00 | reduce_grads: 0.20 | step: 348.10 | _step_clipping: 0.14 | _step_step: 346.40 | _step_zero_grad: 0.49 | _step_check_overflow: 0.52 samples/sec: 16.147 | iteration 17530/ 143000 | elapsed time per iteration (ms): 63415.4 | learning rate: 5.815E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.380423E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 17:11:34,358] [INFO] [logging.py:60:log_dist] [Rank 0] step=17540, skipped=16, lr=[0.0005814425702870401, 0.0005814425702870401], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17540 loss: 2.3821 iter time (s): 63.599 samples/sec: 16.101 %comms: 0.0028475509183977753 %optimizer_step 0.05923666906629188 %forward: 22.91166651133094 %backward: 61.422507556787586 [2025-04-06 17:11:34,359] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27399.39 | forward: 145717.00 | backward_microstep: 390656.77 | backward: 390643.94 | backward_inner_microstep: 390626.31 | backward_inner: 390619.47 | backward_allreduce_microstep: 8.28 | backward_allreduce: 2.88 | reduce_tied_grads: 0.34 | comms: 18.11 | reduce_grads: 0.21 | step: 376.74 | _step_clipping: 0.13 | _step_step: 374.81 | _step_zero_grad: 0.55 | _step_check_overflow: 0.63 samples/sec: 16.101 | iteration 17540/ 143000 | elapsed time per iteration (ms): 63600.0 | learning rate: 5.814E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.384622E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 17:22:12,773] [INFO] [logging.py:60:log_dist] [Rank 0] step=17550, skipped=16, lr=[0.0005814197429426019, 0.0005814197429426019], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17550 loss: 2.3909 iter time (s): 63.841 samples/sec: 16.040 %comms: 0.0028636698444651837 %optimizer_step 0.05851122079377418 %forward: 22.831891706176467 %backward: 61.193912567347276 [2025-04-06 17:22:12,774] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29676.40 | forward: 145760.94 | backward_microstep: 390679.23 | backward: 390667.67 | backward_inner_microstep: 390648.55 | backward_inner: 390641.84 | backward_allreduce_microstep: 10.06 | backward_allreduce: 2.84 | reduce_tied_grads: 0.33 | comms: 18.28 | reduce_grads: 0.21 | step: 373.54 | _step_clipping: 0.15 | _step_step: 371.70 | _step_zero_grad: 0.53 | _step_check_overflow: 0.55 samples/sec: 16.040 | iteration 17550/ 143000 | elapsed time per iteration (ms): 63841.5 | learning rate: 5.814E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.383302E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 17:32:45,102] [INFO] [logging.py:60:log_dist] [Rank 0] step=17560, skipped=16, lr=[0.0005813969020155849, 0.0005813969020155849], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17560 loss: 2.3782 iter time (s): 63.232 samples/sec: 16.194 %comms: 0.002926489752938831 %optimizer_step 0.05698538972114377 %forward: 23.08036251413241 %backward: 61.828012958372206 [2025-04-06 17:32:45,103] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23131.63 | forward: 145942.36 | backward_microstep: 390969.69 | backward: 390952.54 | backward_inner_microstep: 390934.88 | backward_inner: 390927.77 | backward_allreduce_microstep: 8.23 | backward_allreduce: 2.84 | reduce_tied_grads: 0.32 | comms: 18.50 | reduce_grads: 0.20 | step: 360.33 | _step_clipping: 0.12 | _step_step: 358.33 | _step_zero_grad: 0.52 | _step_check_overflow: 0.73 samples/sec: 16.194 | iteration 17560/ 143000 | elapsed time per iteration (ms): 63232.9 | learning rate: 5.814E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.386511E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 17:43:21,055] [INFO] [logging.py:60:log_dist] [Rank 0] step=17570, skipped=16, lr=[0.0005813740475070916, 0.0005813740475070916], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17570 loss: 2.3760 iter time (s): 63.594 samples/sec: 16.102 %comms: 0.0028514889649722086 %optimizer_step 0.0572196686170474 %forward: 22.895820941204267 %backward: 61.412234672956735 [2025-04-06 17:43:21,055] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27575.18 | forward: 145604.65 | backward_microstep: 390558.94 | backward: 390547.54 | backward_inner_microstep: 390530.19 | backward_inner: 390523.31 | backward_allreduce_microstep: 8.33 | backward_allreduce: 2.84 | reduce_tied_grads: 0.37 | comms: 18.13 | reduce_grads: 0.22 | step: 363.89 | _step_clipping: 0.16 | _step_step: 361.86 | _step_zero_grad: 0.54 | _step_check_overflow: 0.68 samples/sec: 16.102 | iteration 17570/ 143000 | elapsed time per iteration (ms): 63595.2 | learning rate: 5.814E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.385316E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 17:53:57,419] [INFO] [logging.py:60:log_dist] [Rank 0] step=17580, skipped=16, lr=[0.0005813511794182252, 0.0005813511794182252], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17580 loss: 2.4028 iter time (s): 63.636 samples/sec: 16.092 %comms: 0.0028796070344348505 %optimizer_step 0.05772773050716484 %forward: 22.885545980882043 %backward: 61.377954613044075 [2025-04-06 17:53:57,420] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27889.19 | forward: 145634.03 | backward_microstep: 390596.96 | backward: 390583.61 | backward_inner_microstep: 390565.89 | backward_inner: 390559.08 | backward_allreduce_microstep: 8.50 | backward_allreduce: 2.89 | reduce_tied_grads: 0.36 | comms: 18.32 | reduce_grads: 0.23 | step: 367.36 | _step_clipping: 0.15 | _step_step: 365.16 | _step_zero_grad: 0.56 | _step_check_overflow: 0.59 samples/sec: 16.091 | iteration 17580/ 143000 | elapsed time per iteration (ms): 63636.4 | learning rate: 5.814E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.388016E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 18:04:30,874] [INFO] [logging.py:60:log_dist] [Rank 0] step=17590, skipped=16, lr=[0.0005813282977500891, 0.0005813282977500891], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17590 loss: 2.3657 iter time (s): 63.345 samples/sec: 16.165 %comms: 0.0028906126121265104 %optimizer_step 0.05867040286175526 %forward: 22.995654004210525 %backward: 61.669237902341436 [2025-04-06 18:04:30,875] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24869.06 | forward: 145665.66 | backward_microstep: 390657.31 | backward: 390642.96 | backward_inner_microstep: 390624.92 | backward_inner: 390617.85 | backward_allreduce_microstep: 8.49 | backward_allreduce: 2.93 | reduce_tied_grads: 0.36 | comms: 18.31 | reduce_grads: 0.22 | step: 371.65 | _step_clipping: 0.14 | _step_step: 369.67 | _step_zero_grad: 0.55 | _step_check_overflow: 0.64 samples/sec: 16.165 | iteration 17590/ 143000 | elapsed time per iteration (ms): 63345.5 | learning rate: 5.813E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.381038E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 18:15:10,003] [INFO] [logging.py:60:log_dist] [Rank 0] step=17600, skipped=16, lr=[0.0005813054025037878, 0.0005813054025037878], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17600 loss: 2.3912 iter time (s): 63.912 samples/sec: 16.022 %comms: 0.002896173142958631 %optimizer_step 0.0567660679581147 %forward: 22.787819172282422 %backward: 61.105125699879316 [2025-04-06 18:15:10,004] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30662.82 | forward: 145642.15 | backward_microstep: 390550.72 | backward: 390536.80 | backward_inner_microstep: 390519.12 | backward_inner: 390512.16 | backward_allreduce_microstep: 8.16 | backward_allreduce: 2.80 | reduce_tied_grads: 0.34 | comms: 18.51 | reduce_grads: 0.21 | step: 362.80 | _step_clipping: 0.15 | _step_step: 360.89 | _step_zero_grad: 0.50 | _step_check_overflow: 0.64 samples/sec: 16.022 | iteration 17600/ 143000 | elapsed time per iteration (ms): 63912.9 | learning rate: 5.813E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.380216E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 18:25:57,195] [INFO] [logging.py:60:log_dist] [Rank 0] step=17610, skipped=16, lr=[0.0005812824936804265, 0.0005812824936804265], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17610 loss: 2.3792 iter time (s): 64.719 samples/sec: 15.822 %comms: 0.0028808329421396127 %optimizer_step 0.054953251425262716 %forward: 22.536140227008744 %backward: 60.352440446401886 [2025-04-06 18:25:57,195] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 38484.02 | forward: 145850.63 | backward_microstep: 390611.69 | backward: 390592.24 | backward_inner_microstep: 390574.78 | backward_inner: 390567.67 | backward_allreduce_microstep: 8.11 | backward_allreduce: 2.79 | reduce_tied_grads: 0.32 | comms: 18.64 | reduce_grads: 0.24 | step: 355.65 | _step_clipping: 0.13 | _step_step: 353.73 | _step_zero_grad: 0.56 | _step_check_overflow: 0.59 samples/sec: 15.822 | iteration 17610/ 143000 | elapsed time per iteration (ms): 64719.1 | learning rate: 5.813E-04 | approx flops per GPU: 68.3TFLOPS | lm_loss: 2.384994E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 18:36:25,458] [INFO] [logging.py:60:log_dist] [Rank 0] step=17620, skipped=16, lr=[0.0005812595712811107, 0.0005812595712811107], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17620 loss: 2.3868 iter time (s): 62.826 samples/sec: 16.299 %comms: 0.0029582104345443317 %optimizer_step 0.06001851946293177 %forward: 23.174098022675327 %backward: 62.14106394951253 [2025-04-06 18:36:25,459] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20010.76 | forward: 145593.21 | backward_microstep: 390420.72 | backward: 390406.44 | backward_inner_microstep: 390388.67 | backward_inner: 390381.56 | backward_allreduce_microstep: 8.51 | backward_allreduce: 2.90 | reduce_tied_grads: 0.34 | comms: 18.59 | reduce_grads: 0.22 | step: 377.07 | _step_clipping: 0.13 | _step_step: 374.99 | _step_zero_grad: 0.52 | _step_check_overflow: 0.80 samples/sec: 16.299 | iteration 17620/ 143000 | elapsed time per iteration (ms): 62826.4 | learning rate: 5.813E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.389765E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 18:47:00,212] [INFO] [logging.py:60:log_dist] [Rank 0] step=17630, skipped=16, lr=[0.0005812366353069468, 0.0005812366353069468], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17630 loss: 2.3829 iter time (s): 63.475 samples/sec: 16.132 %comms: 0.002896306569231627 %optimizer_step 0.057015535213525634 %forward: 22.936556670346473 %backward: 61.507465935112016 [2025-04-06 18:47:00,213] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26498.83 | forward: 145589.10 | backward_microstep: 390431.18 | backward: 390416.78 | backward_inner_microstep: 390398.45 | backward_inner: 390391.44 | backward_allreduce_microstep: 8.84 | backward_allreduce: 2.99 | reduce_tied_grads: 0.36 | comms: 18.38 | reduce_grads: 0.22 | step: 361.90 | _step_clipping: 0.14 | _step_step: 359.95 | _step_zero_grad: 0.55 | _step_check_overflow: 0.61 samples/sec: 16.132 | iteration 17630/ 143000 | elapsed time per iteration (ms): 63475.3 | learning rate: 5.812E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.387419E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 18:57:32,271] [INFO] [logging.py:60:log_dist] [Rank 0] step=17640, skipped=16, lr=[0.0005812136857590418, 0.0005812136857590418], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17640 loss: 2.3968 iter time (s): 63.205 samples/sec: 16.201 %comms: 0.0028580268409017983 %optimizer_step 0.05659385030239106 %forward: 23.02008120642288 %backward: 61.763742550113044 [2025-04-06 18:57:32,272] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23946.92 | forward: 145499.24 | backward_microstep: 390393.44 | backward: 390379.94 | backward_inner_microstep: 390362.36 | backward_inner: 390355.43 | backward_allreduce_microstep: 8.28 | backward_allreduce: 2.83 | reduce_tied_grads: 0.32 | comms: 18.06 | reduce_grads: 0.21 | step: 357.70 | _step_clipping: 0.12 | _step_step: 356.03 | _step_zero_grad: 0.48 | _step_check_overflow: 0.49 samples/sec: 16.201 | iteration 17640/ 143000 | elapsed time per iteration (ms): 63205.9 | learning rate: 5.812E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.383415E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 19:08:12,206] [INFO] [logging.py:60:log_dist] [Rank 0] step=17650, skipped=16, lr=[0.0005811907226385033, 0.0005811907226385033], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17650 loss: 2.3778 iter time (s): 63.993 samples/sec: 16.002 %comms: 0.0028518771402311787 %optimizer_step 0.05887498534638828 %forward: 22.75850523857726 %backward: 61.00082067602537 [2025-04-06 19:08:12,207] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31704.56 | forward: 145638.27 | backward_microstep: 390378.82 | backward: 390361.94 | backward_inner_microstep: 390343.94 | backward_inner: 390336.94 | backward_allreduce_microstep: 8.57 | backward_allreduce: 2.95 | reduce_tied_grads: 0.35 | comms: 18.25 | reduce_grads: 0.21 | step: 376.76 | _step_clipping: 0.14 | _step_step: 374.91 | _step_zero_grad: 0.51 | _step_check_overflow: 0.56 samples/sec: 16.002 | iteration 17650/ 143000 | elapsed time per iteration (ms): 63993.5 | learning rate: 5.812E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.381302E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 19:18:56,371] [INFO] [logging.py:60:log_dist] [Rank 0] step=17660, skipped=16, lr=[0.0005811677459464395, 0.0005811677459464395], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17660 loss: 2.3655 iter time (s): 64.416 samples/sec: 15.897 %comms: 0.00287634369425732 %optimizer_step 0.05972870382386545 %forward: 22.61555044938822 %backward: 60.59185850534262 [2025-04-06 19:18:56,372] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 35940.65 | forward: 145680.07 | backward_microstep: 390320.42 | backward: 390307.83 | backward_inner_microstep: 390289.66 | backward_inner: 390282.73 | backward_allreduce_microstep: 8.62 | backward_allreduce: 2.99 | reduce_tied_grads: 0.36 | comms: 18.53 | reduce_grads: 0.23 | step: 384.75 | _step_clipping: 0.14 | _step_step: 382.77 | _step_zero_grad: 0.59 | _step_check_overflow: 0.58 samples/sec: 15.897 | iteration 17660/ 143000 | elapsed time per iteration (ms): 64416.5 | learning rate: 5.812E-04 | approx flops per GPU: 68.6TFLOPS | lm_loss: 2.385619E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 19:29:39,388] [INFO] [logging.py:60:log_dist] [Rank 0] step=17670, skipped=16, lr=[0.0005811447556839597, 0.0005811447556839597], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17670 loss: 2.3861 iter time (s): 64.301 samples/sec: 15.925 %comms: 0.0028016488183833094 %optimizer_step 0.0561882239007865 %forward: 22.63571621343163 %backward: 60.67674825360327 [2025-04-06 19:29:39,388] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 35139.09 | forward: 145550.13 | backward_microstep: 390168.12 | backward: 390158.12 | backward_inner_microstep: 390140.27 | backward_inner: 390133.51 | backward_allreduce_microstep: 8.66 | backward_allreduce: 2.98 | reduce_tied_grads: 0.31 | comms: 18.01 | reduce_grads: 0.21 | step: 361.30 | _step_clipping: 0.13 | _step_step: 359.49 | _step_zero_grad: 0.54 | _step_check_overflow: 0.55 samples/sec: 15.925 | iteration 17670/ 143000 | elapsed time per iteration (ms): 64301.7 | learning rate: 5.811E-04 | approx flops per GPU: 68.7TFLOPS | lm_loss: 2.384929E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 19:40:20,363] [INFO] [logging.py:60:log_dist] [Rank 0] step=17680, skipped=16, lr=[0.0005811217518521732, 0.0005811217518521732], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17680 loss: 2.3895 iter time (s): 64.097 samples/sec: 15.976 %comms: 0.00287778875233888 %optimizer_step 0.05587560497500091 %forward: 22.716862576177082 %backward: 60.9056620179949 [2025-04-06 19:40:20,363] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32719.16 | forward: 145608.02 | backward_microstep: 390399.58 | backward: 390386.34 | backward_inner_microstep: 390368.84 | backward_inner: 390361.99 | backward_allreduce_microstep: 8.27 | backward_allreduce: 2.85 | reduce_tied_grads: 0.32 | comms: 18.45 | reduce_grads: 0.20 | step: 358.15 | _step_clipping: 0.13 | _step_step: 356.34 | _step_zero_grad: 0.49 | _step_check_overflow: 0.57 samples/sec: 15.976 | iteration 17680/ 143000 | elapsed time per iteration (ms): 64097.5 | learning rate: 5.811E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.380700E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 19:50:56,734] [INFO] [logging.py:60:log_dist] [Rank 0] step=17690, skipped=16, lr=[0.0005810987344521903, 0.0005810987344521903], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17690 loss: 2.3949 iter time (s): 63.636 samples/sec: 16.091 %comms: 0.0028451853041902325 %optimizer_step 0.05631024885755831 %forward: 22.87408517247907 %backward: 61.32765759184522 [2025-04-06 19:50:56,735] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28339.12 | forward: 145562.53 | backward_microstep: 390279.20 | backward: 390267.36 | backward_inner_microstep: 390250.15 | backward_inner: 390243.49 | backward_allreduce_microstep: 8.20 | backward_allreduce: 2.81 | reduce_tied_grads: 0.35 | comms: 18.11 | reduce_grads: 0.22 | step: 358.34 | _step_clipping: 0.15 | _step_step: 356.35 | _step_zero_grad: 0.55 | _step_check_overflow: 0.65 samples/sec: 16.091 | iteration 17690/ 143000 | elapsed time per iteration (ms): 63637.1 | learning rate: 5.811E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.381260E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 20:01:30,204] [INFO] [logging.py:60:log_dist] [Rank 0] step=17700, skipped=16, lr=[0.0005810757034851221, 0.0005810757034851221], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17700 loss: 2.3900 iter time (s): 63.346 samples/sec: 16.165 %comms: 0.002862807421026884 %optimizer_step 0.055444852427256865 %forward: 22.99582141108985 %backward: 61.63843151917015 [2025-04-06 20:01:30,204] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25079.56 | forward: 145670.07 | backward_microstep: 390471.40 | backward: 390456.80 | backward_inner_microstep: 390439.10 | backward_inner: 390431.99 | backward_allreduce_microstep: 8.36 | backward_allreduce: 2.87 | reduce_tied_grads: 0.34 | comms: 18.13 | reduce_grads: 0.22 | step: 351.22 | _step_clipping: 0.13 | _step_step: 349.29 | _step_zero_grad: 0.51 | _step_check_overflow: 0.64 samples/sec: 16.165 | iteration 17700/ 143000 | elapsed time per iteration (ms): 63347.0 | learning rate: 5.811E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.386184E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 20:12:15,299] [INFO] [logging.py:60:log_dist] [Rank 0] step=17710, skipped=16, lr=[0.00058105265895208, 0.00058105265895208], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17710 loss: 2.3725 iter time (s): 64.509 samples/sec: 15.874 %comms: 0.0028303988776337136 %optimizer_step 0.057025541033903615 %forward: 22.611176083766217 %backward: 60.54430258706843 [2025-04-06 20:12:15,300] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 36393.69 | forward: 145862.05 | backward_microstep: 390577.72 | backward: 390564.20 | backward_inner_microstep: 390546.18 | backward_inner: 390539.20 | backward_allreduce_microstep: 8.56 | backward_allreduce: 3.02 | reduce_tied_grads: 0.34 | comms: 18.26 | reduce_grads: 0.21 | step: 367.87 | _step_clipping: 0.12 | _step_step: 365.98 | _step_zero_grad: 0.59 | _step_check_overflow: 0.54 samples/sec: 15.874 | iteration 17710/ 143000 | elapsed time per iteration (ms): 64509.6 | learning rate: 5.811E-04 | approx flops per GPU: 68.5TFLOPS | lm_loss: 2.379899E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 20:22:51,575] [INFO] [logging.py:60:log_dist] [Rank 0] step=17720, skipped=16, lr=[0.0005810296008541763, 0.0005810296008541763], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17720 loss: 2.3770 iter time (s): 63.627 samples/sec: 16.094 %comms: 0.002832531374324378 %optimizer_step 0.05697741479355025 %forward: 22.88176804803767 %backward: 61.366170350853 [2025-04-06 20:22:51,576] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27976.15 | forward: 145589.74 | backward_microstep: 390470.54 | backward: 390454.31 | backward_inner_microstep: 390436.71 | backward_inner: 390429.84 | backward_allreduce_microstep: 8.20 | backward_allreduce: 2.81 | reduce_tied_grads: 0.32 | comms: 18.02 | reduce_grads: 0.20 | step: 362.53 | _step_clipping: 0.12 | _step_step: 360.87 | _step_zero_grad: 0.50 | _step_check_overflow: 0.47 samples/sec: 16.094 | iteration 17720/ 143000 | elapsed time per iteration (ms): 63627.6 | learning rate: 5.810E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.378459E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 20:33:29,722] [INFO] [logging.py:60:log_dist] [Rank 0] step=17730, skipped=16, lr=[0.0005810065291925239, 0.0005810065291925239], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17730 loss: 2.3851 iter time (s): 63.814 samples/sec: 16.047 %comms: 0.0028459309754627015 %optimizer_step 0.06056389793355063 %forward: 22.816445579311008 %backward: 61.15010606350568 [2025-04-06 20:33:29,723] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30096.33 | forward: 145601.14 | backward_microstep: 390234.93 | backward: 390224.03 | backward_inner_microstep: 390206.70 | backward_inner: 390200.11 | backward_allreduce_microstep: 8.25 | backward_allreduce: 2.86 | reduce_tied_grads: 0.33 | comms: 18.16 | reduce_grads: 0.22 | step: 386.48 | _step_clipping: 0.13 | _step_step: 384.65 | _step_zero_grad: 0.55 | _step_check_overflow: 0.53 samples/sec: 16.046 | iteration 17730/ 143000 | elapsed time per iteration (ms): 63814.7 | learning rate: 5.810E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.378602E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 20:38:50,625] [INFO] [stage1.py:697:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 524288.0, reducing to 524288.0 [2025-04-06 20:39:52,970] [INFO] [stage1.py:697:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 524288.0, reducing to 262144.0 [2025-04-06 20:44:04,714] [INFO] [logging.py:60:log_dist] [Rank 0] step=17740, skipped=18, lr=[0.0005809880620980511, 0.0005809880620980511], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17740 loss: 2.4189 iter time (s): 63.499 samples/sec: 16.126 %comms: 0.0022941649908522514 %optimizer_step 0.048956266631766894 %forward: 22.95388748268347 %backward: 61.48624021408211 [2025-04-06 20:44:04,714] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26556.51 | forward: 145753.84 | backward_microstep: 390440.24 | backward: 390428.67 | backward_inner_microstep: 390411.26 | backward_inner: 390404.71 | backward_allreduce_microstep: 8.19 | backward_allreduce: 2.82 | reduce_tied_grads: 0.38 | comms: 14.57 | reduce_grads: 0.22 | step: 310.87 | _step_clipping: 0.14 | _step_step: 309.09 | _step_zero_grad: 0.52 | _step_check_overflow: 0.51 samples/sec: 16.126 | iteration 17740/ 143000 | elapsed time per iteration (ms): 63499.1 | learning rate: 5.810E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.401761E+00 | loss scale: 262144.0 | number of skipped iterations: 2 | number of nan iterations: 0 | time (ms) [2025-04-06 20:54:37,829] [INFO] [logging.py:60:log_dist] [Rank 0] step=17750, skipped=18, lr=[0.0005809649660244577, 0.0005809649660244577], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17750 loss: 2.4101 iter time (s): 63.311 samples/sec: 16.174 %comms: 0.0028378943583655465 %optimizer_step 0.055089999574163336 %forward: 22.980421917047547 %backward: 61.673173633175374 [2025-04-06 20:54:37,830] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24925.29 | forward: 145491.29 | backward_microstep: 390470.09 | backward: 390458.87 | backward_inner_microstep: 390441.89 | backward_inner: 390435.23 | backward_allreduce_microstep: 8.08 | backward_allreduce: 2.75 | reduce_tied_grads: 0.31 | comms: 17.97 | reduce_grads: 0.19 | step: 348.78 | _step_clipping: 0.12 | _step_step: 346.93 | _step_zero_grad: 0.46 | _step_check_overflow: 0.52 samples/sec: 16.174 | iteration 17750/ 143000 | elapsed time per iteration (ms): 63311.6 | learning rate: 5.810E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.404599E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 21:05:11,005] [INFO] [logging.py:60:log_dist] [Rank 0] step=17760, skipped=18, lr=[0.000580941856390235, 0.000580941856390235], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17760 loss: 2.3676 iter time (s): 63.317 samples/sec: 16.173 %comms: 0.003169360989448918 %optimizer_step 0.056881913283505156 %forward: 23.01939013652589 %backward: 61.66171493345569 [2025-04-06 21:05:11,006] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24719.39 | forward: 145751.95 | backward_microstep: 390437.97 | backward: 390423.68 | backward_inner_microstep: 390406.51 | backward_inner: 390399.68 | backward_allreduce_microstep: 8.26 | backward_allreduce: 2.86 | reduce_tied_grads: 0.34 | comms: 20.07 | reduce_grads: 0.21 | step: 360.16 | _step_clipping: 0.14 | _step_step: 358.34 | _step_zero_grad: 0.55 | _step_check_overflow: 0.51 samples/sec: 16.172 | iteration 17760/ 143000 | elapsed time per iteration (ms): 63317.7 | learning rate: 5.809E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.388143E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 21:15:46,039] [INFO] [logging.py:60:log_dist] [Rank 0] step=17770, skipped=18, lr=[0.0005809187331964986, 0.0005809187331964986], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17770 loss: 2.4049 iter time (s): 63.503 samples/sec: 16.125 %comms: 0.002919840774718751 %optimizer_step 0.056948196020905194 %forward: 22.934995127539644 %backward: 61.49725628659637 [2025-04-06 21:15:46,040] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26584.88 | forward: 145643.69 | backward_microstep: 390537.59 | backward: 390524.93 | backward_inner_microstep: 390507.01 | backward_inner: 390500.11 | backward_allreduce_microstep: 8.52 | backward_allreduce: 2.93 | reduce_tied_grads: 0.35 | comms: 18.54 | reduce_grads: 0.24 | step: 361.64 | _step_clipping: 0.15 | _step_step: 359.53 | _step_zero_grad: 0.54 | _step_check_overflow: 0.75 samples/sec: 16.125 | iteration 17770/ 143000 | elapsed time per iteration (ms): 63503.4 | learning rate: 5.809E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.417231E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 21:26:24,613] [INFO] [logging.py:60:log_dist] [Rank 0] step=17780, skipped=18, lr=[0.0005808955964443644, 0.0005808955964443644], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17780 loss: 2.3698 iter time (s): 63.857 samples/sec: 16.036 %comms: 0.002870391615971066 %optimizer_step 0.05932054309992812 %forward: 22.826069982625246 %backward: 61.17500944950027 [2025-04-06 21:26:24,614] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29848.48 | forward: 145759.82 | backward_microstep: 390658.63 | backward: 390643.60 | backward_inner_microstep: 390625.82 | backward_inner: 390618.73 | backward_allreduce_microstep: 8.32 | backward_allreduce: 2.87 | reduce_tied_grads: 0.37 | comms: 18.33 | reduce_grads: 0.22 | step: 378.80 | _step_clipping: 0.15 | _step_step: 376.81 | _step_zero_grad: 0.57 | _step_check_overflow: 0.62 samples/sec: 16.036 | iteration 17780/ 143000 | elapsed time per iteration (ms): 63857.4 | learning rate: 5.809E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.384108E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 21:36:55,192] [INFO] [logging.py:60:log_dist] [Rank 0] step=17790, skipped=18, lr=[0.0005808724461349492, 0.0005808724461349492], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17790 loss: 2.3705 iter time (s): 63.057 samples/sec: 16.239 %comms: 0.0029274278163299553 %optimizer_step 0.05709363320909989 %forward: 23.11624199898342 %backward: 61.97513733608384 [2025-04-06 21:36:55,193] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21629.37 | forward: 145764.69 | backward_microstep: 390814.91 | backward: 390798.25 | backward_inner_microstep: 390778.54 | backward_inner: 390771.34 | backward_allreduce_microstep: 8.33 | backward_allreduce: 2.86 | reduce_tied_grads: 0.32 | comms: 18.46 | reduce_grads: 0.24 | step: 360.02 | _step_clipping: 0.12 | _step_step: 358.09 | _step_zero_grad: 0.54 | _step_check_overflow: 0.62 samples/sec: 16.239 | iteration 17790/ 143000 | elapsed time per iteration (ms): 63057.9 | learning rate: 5.809E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.380447E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 21:47:24,152] [INFO] [logging.py:60:log_dist] [Rank 0] step=17800, skipped=18, lr=[0.0005808492822693702, 0.0005808492822693702], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17800 loss: 2.3848 iter time (s): 62.895 samples/sec: 16.281 %comms: 0.003016846729448572 %optimizer_step 0.05793812730259543 %forward: 23.16824361833885 %backward: 62.14429033445439 [2025-04-06 21:47:24,152] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20012.64 | forward: 145717.32 | backward_microstep: 390874.54 | backward: 390858.27 | backward_inner_microstep: 390840.50 | backward_inner: 390833.37 | backward_allreduce_microstep: 8.28 | backward_allreduce: 2.87 | reduce_tied_grads: 0.36 | comms: 18.97 | reduce_grads: 0.21 | step: 364.40 | _step_clipping: 0.13 | _step_step: 362.42 | _step_zero_grad: 0.57 | _step_check_overflow: 0.61 samples/sec: 16.281 | iteration 17800/ 143000 | elapsed time per iteration (ms): 62895.9 | learning rate: 5.808E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.384089E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 21:58:05,116] [INFO] [logging.py:60:log_dist] [Rank 0] step=17810, skipped=18, lr=[0.0005808261048487456, 0.0005808261048487456], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17810 loss: 2.3905 iter time (s): 64.096 samples/sec: 15.976 %comms: 0.0028690943287700567 %optimizer_step 0.05788267246126215 %forward: 22.767814956750247 %backward: 60.94233544112463 [2025-04-06 21:58:05,116] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32094.09 | forward: 145932.23 | backward_microstep: 390628.85 | backward: 390615.03 | backward_inner_microstep: 390597.03 | backward_inner: 390589.86 | backward_allreduce_microstep: 8.48 | backward_allreduce: 2.93 | reduce_tied_grads: 0.38 | comms: 18.39 | reduce_grads: 0.24 | step: 371.00 | _step_clipping: 0.13 | _step_step: 369.13 | _step_zero_grad: 0.53 | _step_check_overflow: 0.55 samples/sec: 15.976 | iteration 17810/ 143000 | elapsed time per iteration (ms): 64096.4 | learning rate: 5.808E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.383630E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 22:08:37,752] [INFO] [logging.py:60:log_dist] [Rank 0] step=17820, skipped=18, lr=[0.0005808029138741937, 0.0005808029138741937], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17820 loss: 2.3795 iter time (s): 63.263 samples/sec: 16.186 %comms: 0.002902492890452655 %optimizer_step 0.058110006072442044 %forward: 23.041932491464472 %backward: 61.7547857418008 [2025-04-06 22:08:37,753] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23859.97 | forward: 145770.21 | backward_microstep: 390694.73 | backward: 390679.40 | backward_inner_microstep: 390661.16 | backward_inner: 390653.95 | backward_allreduce_microstep: 8.43 | backward_allreduce: 2.89 | reduce_tied_grads: 0.33 | comms: 18.36 | reduce_grads: 0.22 | step: 367.62 | _step_clipping: 0.14 | _step_step: 365.62 | _step_zero_grad: 0.67 | _step_check_overflow: 0.54 samples/sec: 16.186 | iteration 17820/ 143000 | elapsed time per iteration (ms): 63263.7 | learning rate: 5.808E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.381909E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 22:19:17,091] [INFO] [logging.py:60:log_dist] [Rank 0] step=17830, skipped=18, lr=[0.0005807797093468342, 0.0005807797093468342], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17830 loss: 2.3953 iter time (s): 63.933 samples/sec: 16.017 %comms: 0.002869863920440727 %optimizer_step 0.05678229957762325 %forward: 22.75095211931819 %backward: 61.037311894510715 [2025-04-06 22:19:17,092] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31404.35 | forward: 145454.28 | backward_microstep: 390241.88 | backward: 390231.50 | backward_inner_microstep: 390211.78 | backward_inner: 390204.89 | backward_allreduce_microstep: 8.59 | backward_allreduce: 2.95 | reduce_tied_grads: 0.35 | comms: 18.35 | reduce_grads: 0.23 | step: 363.03 | _step_clipping: 0.14 | _step_step: 361.22 | _step_zero_grad: 0.55 | _step_check_overflow: 0.49 samples/sec: 16.017 | iteration 17830/ 143000 | elapsed time per iteration (ms): 63933.9 | learning rate: 5.808E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.383752E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 22:29:47,622] [INFO] [logging.py:60:log_dist] [Rank 0] step=17840, skipped=18, lr=[0.0005807564912677868, 0.0005807564912677868], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17840 loss: 2.4013 iter time (s): 63.052 samples/sec: 16.240 %comms: 0.0029528344488191595 %optimizer_step 0.0574685162076281 %forward: 23.12162536446941 %backward: 61.974631646465795 [2025-04-06 22:29:47,623] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21627.31 | forward: 145787.52 | backward_microstep: 390782.04 | backward: 390765.25 | backward_inner_microstep: 390745.81 | backward_inner: 390738.45 | backward_allreduce_microstep: 8.29 | backward_allreduce: 2.82 | reduce_tied_grads: 0.33 | comms: 18.62 | reduce_grads: 0.20 | step: 362.35 | _step_clipping: 0.13 | _step_step: 360.25 | _step_zero_grad: 0.60 | _step_check_overflow: 0.70 samples/sec: 16.240 | iteration 17840/ 143000 | elapsed time per iteration (ms): 63053.1 | learning rate: 5.808E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.385581E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 22:40:20,318] [INFO] [logging.py:60:log_dist] [Rank 0] step=17850, skipped=18, lr=[0.0005807332596381722, 0.0005807332596381722], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17850 loss: 2.3927 iter time (s): 63.269 samples/sec: 16.185 %comms: 0.0029229808577251903 %optimizer_step 0.05839107122414421 %forward: 23.00457887530785 %backward: 61.70139375397356 [2025-04-06 22:40:20,319] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24465.75 | forward: 145547.72 | backward_microstep: 390393.78 | backward: 390378.68 | backward_inner_microstep: 390360.88 | backward_inner: 390353.88 | backward_allreduce_microstep: 8.39 | backward_allreduce: 2.89 | reduce_tied_grads: 0.34 | comms: 18.49 | reduce_grads: 0.22 | step: 369.43 | _step_clipping: 0.13 | _step_step: 367.54 | _step_zero_grad: 0.55 | _step_check_overflow: 0.56 samples/sec: 16.185 | iteration 17850/ 143000 | elapsed time per iteration (ms): 63269.7 | learning rate: 5.807E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.380222E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 22:51:00,046] [INFO] [logging.py:60:log_dist] [Rank 0] step=17860, skipped=18, lr=[0.0005807100144591117, 0.0005807100144591117], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17860 loss: 2.3736 iter time (s): 63.972 samples/sec: 16.007 %comms: 0.00283171184964741 %optimizer_step 0.055699675182711836 %forward: 22.79803480524417 %backward: 61.04359375557715 [2025-04-06 22:51:00,046] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31060.56 | forward: 145843.72 | backward_microstep: 390523.13 | backward: 390508.44 | backward_inner_microstep: 390488.79 | backward_inner: 390481.77 | backward_allreduce_microstep: 10.15 | backward_allreduce: 4.61 | reduce_tied_grads: 0.30 | comms: 18.12 | reduce_grads: 0.20 | step: 356.32 | _step_clipping: 0.13 | _step_step: 354.61 | _step_zero_grad: 0.50 | _step_check_overflow: 0.48 samples/sec: 16.007 | iteration 17860/ 143000 | elapsed time per iteration (ms): 63972.7 | learning rate: 5.807E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.378107E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 23:01:31,935] [INFO] [logging.py:60:log_dist] [Rank 0] step=17870, skipped=18, lr=[0.000580686755731727, 0.000580686755731727], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17870 loss: 2.3830 iter time (s): 63.188 samples/sec: 16.206 %comms: 0.002904792005442807 %optimizer_step 0.056843533888755786 %forward: 23.03530441664999 %backward: 61.79274454986343 [2025-04-06 23:01:31,935] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23552.43 | forward: 145556.22 | backward_microstep: 390474.06 | backward: 390457.97 | backward_inner_microstep: 390439.92 | backward_inner: 390432.77 | backward_allreduce_microstep: 8.63 | backward_allreduce: 3.05 | reduce_tied_grads: 0.32 | comms: 18.35 | reduce_grads: 0.21 | step: 359.18 | _step_clipping: 0.14 | _step_step: 357.12 | _step_zero_grad: 0.56 | _step_check_overflow: 0.72 samples/sec: 16.205 | iteration 17870/ 143000 | elapsed time per iteration (ms): 63188.9 | learning rate: 5.807E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.374631E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 23:12:00,896] [INFO] [logging.py:60:log_dist] [Rank 0] step=17880, skipped=18, lr=[0.000580663483457141, 0.000580663483457141], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17880 loss: 2.3723 iter time (s): 62.896 samples/sec: 16.281 %comms: 0.002909520060249999 %optimizer_step 0.05802099446203861 %forward: 23.147498543634605 %backward: 62.10147216294336 [2025-04-06 23:12:00,897] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20464.46 | forward: 145587.41 | backward_microstep: 390607.89 | backward: 390590.48 | backward_inner_microstep: 390572.38 | backward_inner: 390565.13 | backward_allreduce_microstep: 8.37 | backward_allreduce: 2.93 | reduce_tied_grads: 0.31 | comms: 18.30 | reduce_grads: 0.25 | step: 364.93 | _step_clipping: 0.12 | _step_step: 363.00 | _step_zero_grad: 0.54 | _step_check_overflow: 0.62 samples/sec: 16.281 | iteration 17880/ 143000 | elapsed time per iteration (ms): 62896.1 | learning rate: 5.807E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.377773E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 23:22:45,689] [INFO] [logging.py:60:log_dist] [Rank 0] step=17890, skipped=18, lr=[0.0005806401976364767, 0.0005806401976364767], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17890 loss: 2.3858 iter time (s): 64.479 samples/sec: 15.881 %comms: 0.002829167131352367 %optimizer_step 0.055295754140803795 %forward: 22.603413689633463 %backward: 60.56647889923731 [2025-04-06 23:22:45,690] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 36196.99 | forward: 145744.02 | backward_microstep: 390542.47 | backward: 390525.19 | backward_inner_microstep: 390507.42 | backward_inner: 390500.28 | backward_allreduce_microstep: 8.19 | backward_allreduce: 2.81 | reduce_tied_grads: 0.30 | comms: 18.24 | reduce_grads: 0.20 | step: 356.54 | _step_clipping: 0.12 | _step_step: 354.75 | _step_zero_grad: 0.52 | _step_check_overflow: 0.55 samples/sec: 15.881 | iteration 17890/ 143000 | elapsed time per iteration (ms): 64479.3 | learning rate: 5.806E-04 | approx flops per GPU: 68.5TFLOPS | lm_loss: 2.377311E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 23:33:22,364] [INFO] [logging.py:60:log_dist] [Rank 0] step=17900, skipped=18, lr=[0.000580616898270858, 0.000580616898270858], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17900 loss: 2.3772 iter time (s): 63.667 samples/sec: 16.084 %comms: 0.002866669521040814 %optimizer_step 0.05716232869239843 %forward: 22.85631952253562 %backward: 61.29471464923084 [2025-04-06 23:33:22,365] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28632.59 | forward: 145518.98 | backward_microstep: 390255.69 | backward: 390244.11 | backward_inner_microstep: 390224.91 | backward_inner: 390216.40 | backward_allreduce_microstep: 8.24 | backward_allreduce: 2.83 | reduce_tied_grads: 0.34 | comms: 18.25 | reduce_grads: 0.20 | step: 363.93 | _step_clipping: 0.13 | _step_step: 361.98 | _step_zero_grad: 0.55 | _step_check_overflow: 0.65 samples/sec: 16.084 | iteration 17900/ 143000 | elapsed time per iteration (ms): 63667.5 | learning rate: 5.806E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.381936E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 23:43:55,234] [INFO] [logging.py:60:log_dist] [Rank 0] step=17910, skipped=18, lr=[0.0005805935853614095, 0.0005805935853614095], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17910 loss: 2.3815 iter time (s): 63.286 samples/sec: 16.180 %comms: 0.002885745936007389 %optimizer_step 0.05780808385983115 %forward: 23.01187942706108 %backward: 61.69733905427518 [2025-04-06 23:43:55,235] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24462.47 | forward: 145634.03 | backward_microstep: 390475.01 | backward: 390460.59 | backward_inner_microstep: 390439.33 | backward_inner: 390432.41 | backward_allreduce_microstep: 8.41 | backward_allreduce: 2.90 | reduce_tied_grads: 0.32 | comms: 18.26 | reduce_grads: 0.21 | step: 365.85 | _step_clipping: 0.14 | _step_step: 363.84 | _step_zero_grad: 0.58 | _step_check_overflow: 0.64 samples/sec: 16.180 | iteration 17910/ 143000 | elapsed time per iteration (ms): 63287.1 | learning rate: 5.806E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.380816E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-06 23:54:29,043] [INFO] [logging.py:60:log_dist] [Rank 0] step=17920, skipped=18, lr=[0.0005805702589092563, 0.0005805702589092563], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17920 loss: 2.3889 iter time (s): 63.380 samples/sec: 16.156 %comms: 0.002853976064326554 %optimizer_step 0.055910643375304894 %forward: 22.952963144589184 %backward: 61.5549245916169 [2025-04-06 23:54:29,044] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25907.77 | forward: 145476.51 | backward_microstep: 390147.81 | backward: 390136.81 | backward_inner_microstep: 390119.44 | backward_inner: 390112.90 | backward_allreduce_microstep: 8.29 | backward_allreduce: 2.87 | reduce_tied_grads: 0.32 | comms: 18.09 | reduce_grads: 0.44 | step: 354.36 | _step_clipping: 0.12 | _step_step: 352.49 | _step_zero_grad: 0.54 | _step_check_overflow: 0.60 samples/sec: 16.156 | iteration 17920/ 143000 | elapsed time per iteration (ms): 63380.9 | learning rate: 5.806E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.390675E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 00:05:07,177] [INFO] [logging.py:60:log_dist] [Rank 0] step=17930, skipped=18, lr=[0.0005805469189155243, 0.0005805469189155243], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17930 loss: 2.3890 iter time (s): 63.813 samples/sec: 16.047 %comms: 0.0029469446932641336 %optimizer_step 0.05711505734925492 %forward: 22.81624260177988 %backward: 61.185062732380416 [2025-04-07 00:05:07,178] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29775.16 | forward: 145596.73 | backward_microstep: 390452.67 | backward: 390438.73 | backward_inner_microstep: 390421.25 | backward_inner: 390414.18 | backward_allreduce_microstep: 8.12 | backward_allreduce: 2.81 | reduce_tied_grads: 0.33 | comms: 18.81 | reduce_grads: 0.20 | step: 364.47 | _step_clipping: 0.13 | _step_step: 362.60 | _step_zero_grad: 0.57 | _step_check_overflow: 0.53 samples/sec: 16.047 | iteration 17930/ 143000 | elapsed time per iteration (ms): 63813.4 | learning rate: 5.805E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.390249E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 00:15:43,066] [INFO] [logging.py:60:log_dist] [Rank 0] step=17940, skipped=18, lr=[0.0005805235653813402, 0.0005805235653813402], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17940 loss: 2.3973 iter time (s): 63.588 samples/sec: 16.104 %comms: 0.002921278757720601 %optimizer_step 0.057169859845351646 %forward: 22.918225942118735 %backward: 61.4198897804633 [2025-04-07 00:15:43,067] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27247.77 | forward: 145733.04 | backward_microstep: 390574.08 | backward: 390558.46 | backward_inner_microstep: 390540.07 | backward_inner: 390532.84 | backward_allreduce_microstep: 8.55 | backward_allreduce: 2.95 | reduce_tied_grads: 0.35 | comms: 18.58 | reduce_grads: 0.22 | step: 363.53 | _step_clipping: 0.14 | _step_step: 361.55 | _step_zero_grad: 0.54 | _step_check_overflow: 0.64 samples/sec: 16.103 | iteration 17940/ 143000 | elapsed time per iteration (ms): 63588.9 | learning rate: 5.805E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.383569E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 00:26:17,789] [INFO] [logging.py:60:log_dist] [Rank 0] step=17950, skipped=18, lr=[0.0005805001983078306, 0.0005805001983078306], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17950 loss: 2.3976 iter time (s): 63.472 samples/sec: 16.133 %comms: 0.0028636880703437964 %optimizer_step 0.057296036247513865 %forward: 22.926805127213804 %backward: 61.4864720112918 [2025-04-07 00:26:17,790] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26633.88 | forward: 145520.35 | backward_microstep: 390275.88 | backward: 390265.14 | backward_inner_microstep: 390247.28 | backward_inner: 390238.95 | backward_allreduce_microstep: 8.54 | backward_allreduce: 2.95 | reduce_tied_grads: 0.34 | comms: 18.18 | reduce_grads: 0.20 | step: 363.67 | _step_clipping: 0.14 | _step_step: 361.58 | _step_zero_grad: 0.57 | _step_check_overflow: 0.76 samples/sec: 16.133 | iteration 17950/ 143000 | elapsed time per iteration (ms): 63472.3 | learning rate: 5.805E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.384012E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 00:36:52,796] [INFO] [logging.py:60:log_dist] [Rank 0] step=17960, skipped=18, lr=[0.0005804768176961238, 0.0005804768176961238], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17960 loss: 2.3708 iter time (s): 63.500 samples/sec: 16.126 %comms: 0.002909194179505899 %optimizer_step 0.05708561895213639 %forward: 22.947676642522154 %backward: 61.497115479134756 [2025-04-07 00:36:52,797] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26405.03 | forward: 145717.78 | backward_microstep: 390524.00 | backward: 390506.76 | backward_inner_microstep: 390489.14 | backward_inner: 390482.25 | backward_allreduce_microstep: 8.26 | backward_allreduce: 2.87 | reduce_tied_grads: 0.34 | comms: 18.47 | reduce_grads: 0.21 | step: 362.49 | _step_clipping: 0.13 | _step_step: 360.49 | _step_zero_grad: 0.53 | _step_check_overflow: 0.71 samples/sec: 16.126 | iteration 17960/ 143000 | elapsed time per iteration (ms): 63500.6 | learning rate: 5.805E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.383481E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 00:47:27,301] [INFO] [logging.py:60:log_dist] [Rank 0] step=17970, skipped=18, lr=[0.000580453423547348, 0.000580453423547348], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17970 loss: 2.3956 iter time (s): 63.450 samples/sec: 16.139 %comms: 0.002910819704835269 %optimizer_step 0.05861710385378746 %forward: 22.96858197122753 %backward: 61.55886951943697 [2025-04-07 00:47:27,301] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25823.04 | forward: 145735.21 | backward_microstep: 390607.84 | backward: 390589.85 | backward_inner_microstep: 390571.67 | backward_inner: 390564.42 | backward_allreduce_microstep: 8.48 | backward_allreduce: 2.92 | reduce_tied_grads: 0.34 | comms: 18.47 | reduce_grads: 0.24 | step: 371.92 | _step_clipping: 0.15 | _step_step: 369.87 | _step_zero_grad: 0.59 | _step_check_overflow: 0.63 samples/sec: 16.139 | iteration 17970/ 143000 | elapsed time per iteration (ms): 63450.5 | learning rate: 5.805E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.391038E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 00:58:03,357] [INFO] [logging.py:60:log_dist] [Rank 0] step=17980, skipped=18, lr=[0.0005804300158626324, 0.0005804300158626324], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17980 loss: 2.3599 iter time (s): 63.605 samples/sec: 16.099 %comms: 0.002997914225298854 %optimizer_step 0.05894426095421554 %forward: 22.895289884029715 %backward: 61.37027579215919 [2025-04-07 00:58:03,357] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27751.71 | forward: 145625.55 | backward_microstep: 390358.78 | backward: 390345.80 | backward_inner_microstep: 390328.43 | backward_inner: 390321.60 | backward_allreduce_microstep: 8.24 | backward_allreduce: 2.79 | reduce_tied_grads: 0.29 | comms: 19.07 | reduce_grads: 0.19 | step: 374.92 | _step_clipping: 0.12 | _step_step: 373.06 | _step_zero_grad: 0.52 | _step_check_overflow: 0.60 samples/sec: 16.099 | iteration 17980/ 143000 | elapsed time per iteration (ms): 63605.6 | learning rate: 5.804E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.390744E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 01:08:42,616] [INFO] [logging.py:60:log_dist] [Rank 0] step=17990, skipped=18, lr=[0.0005804065946431069, 0.0005804065946431069], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 17990 loss: 2.4011 iter time (s): 63.925 samples/sec: 16.019 %comms: 0.0030283189112784298 %optimizer_step 0.05933354555385683 %forward: 22.807028628089526 %backward: 61.09696520210641 [2025-04-07 01:08:42,617] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30544.98 | forward: 145794.73 | backward_microstep: 390582.18 | backward: 390564.49 | backward_inner_microstep: 390543.09 | backward_inner: 390535.97 | backward_allreduce_microstep: 8.36 | backward_allreduce: 2.88 | reduce_tied_grads: 0.36 | comms: 19.36 | reduce_grads: 0.22 | step: 379.29 | _step_clipping: 0.14 | _step_step: 377.34 | _step_zero_grad: 0.53 | _step_check_overflow: 0.63 samples/sec: 16.019 | iteration 17990/ 143000 | elapsed time per iteration (ms): 63926.0 | learning rate: 5.804E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.383475E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 01:19:13,096] [INFO] [logging.py:60:log_dist] [Rank 0] step=18000, skipped=18, lr=[0.0005803831598899016, 0.0005803831598899016], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18000 loss: 2.3967 iter time (s): 63.047 samples/sec: 16.242 %comms: 0.0028909055026971743 %optimizer_step 0.05716378303112808 %forward: 23.083452628737618 %backward: 61.91846673557142 [2025-04-07 01:19:13,096] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22224.28 | forward: 145534.99 | backward_microstep: 390392.21 | backward: 390379.35 | backward_inner_microstep: 390356.68 | backward_inner: 390349.54 | backward_allreduce_microstep: 8.24 | backward_allreduce: 2.84 | reduce_tied_grads: 0.32 | comms: 18.23 | reduce_grads: 0.19 | step: 360.40 | _step_clipping: 0.14 | _step_step: 358.52 | _step_zero_grad: 0.55 | _step_check_overflow: 0.59 samples/sec: 16.242 | iteration 18000/ 143000 | elapsed time per iteration (ms): 63047.9 | learning rate: 5.804E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.388488E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 01:19:16,106] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: ../checkpoints/mydata_left-pythia160m/global_step18000/mp_rank_00_model_states.pt [2025-04-07 01:19:30,370] [INFO] [engine.py:1805:_copy_recovery_script] creating recovery script ../checkpoints/mydata_left-pythia160m/zero_to_fp32.py [2025-04-07 01:19:30,376] [INFO] [engine.py:1818:_save_zero_checkpoint] zero checkpoint saved ../checkpoints/mydata_left-pythia160m/global_step18000/zero_pp_rank_0_mp_rank_00_optim_states.pt [2025-04-07 01:30:07,163] [INFO] [logging.py:60:log_dist] [Rank 0] step=18010, skipped=18, lr=[0.0005803597116041477, 0.0005803597116041477], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18010 loss: 2.3821 iter time (s): 63.677 samples/sec: 16.081 %comms: 0.0028674010046829766 %optimizer_step 0.05821824108462152 %forward: 22.856573698788825 %backward: 61.29479473596291 [2025-04-07 01:30:07,164] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28600.46 | forward: 145544.29 | backward_microstep: 390321.75 | backward: 390308.16 | backward_inner_microstep: 390290.19 | backward_inner: 390283.07 | backward_allreduce_microstep: 8.50 | backward_allreduce: 2.95 | reduce_tied_grads: 0.35 | comms: 18.26 | reduce_grads: 0.23 | step: 370.72 | _step_clipping: 0.14 | _step_step: 368.63 | _step_zero_grad: 0.61 | _step_check_overflow: 0.67 samples/sec: 15.656 | iteration 18010/ 143000 | elapsed time per iteration (ms): 65406.8 | learning rate: 5.804E-04 | approx flops per GPU: 67.5TFLOPS | lm_loss: 2.381510E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 01:40:38,147] [INFO] [logging.py:60:log_dist] [Rank 0] step=18020, skipped=18, lr=[0.0005803362497869771, 0.0005803362497869771], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18020 loss: 2.3982 iter time (s): 63.098 samples/sec: 16.229 %comms: 0.0029111902057061623 %optimizer_step 0.056433825498707416 %forward: 23.09635910278624 %backward: 61.90525489325712 [2025-04-07 01:40:38,148] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22238.00 | forward: 145732.86 | backward_microstep: 390625.45 | backward: 390608.32 | backward_inner_microstep: 390588.22 | backward_inner: 390578.69 | backward_allreduce_microstep: 10.37 | backward_allreduce: 3.02 | reduce_tied_grads: 0.32 | comms: 18.37 | reduce_grads: 0.21 | step: 356.08 | _step_clipping: 0.12 | _step_step: 354.15 | _step_zero_grad: 0.54 | _step_check_overflow: 0.66 samples/sec: 16.229 | iteration 18020/ 143000 | elapsed time per iteration (ms): 63098.4 | learning rate: 5.803E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.387071E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 01:51:11,989] [INFO] [logging.py:60:log_dist] [Rank 0] step=18030, skipped=18, lr=[0.0005803127744395219, 0.0005803127744395219], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18030 loss: 2.3913 iter time (s): 63.384 samples/sec: 16.156 %comms: 0.0028746249059673 %optimizer_step 0.05631834124776754 %forward: 22.961419830737324 %backward: 61.552908582545825 [2025-04-07 01:51:11,990] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25869.13 | forward: 145537.88 | backward_microstep: 390155.85 | backward: 390144.86 | backward_inner_microstep: 390127.39 | backward_inner: 390120.69 | backward_allreduce_microstep: 8.26 | backward_allreduce: 2.85 | reduce_tied_grads: 0.34 | comms: 18.22 | reduce_grads: 0.21 | step: 356.97 | _step_clipping: 0.13 | _step_step: 355.25 | _step_zero_grad: 0.49 | _step_check_overflow: 0.51 samples/sec: 16.155 | iteration 18030/ 143000 | elapsed time per iteration (ms): 63384.2 | learning rate: 5.803E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.385126E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 02:01:47,504] [INFO] [logging.py:60:log_dist] [Rank 0] step=18040, skipped=18, lr=[0.0005802892855629153, 0.0005802892855629153], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18040 loss: 2.3901 iter time (s): 63.551 samples/sec: 16.113 %comms: 0.0028689369532845393 %optimizer_step 0.0554724671691758 %forward: 22.92802126771566 %backward: 61.454201579646096 [2025-04-07 02:01:47,504] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26888.63 | forward: 145709.58 | backward_microstep: 390563.04 | backward: 390546.83 | backward_inner_microstep: 390525.87 | backward_inner: 390518.72 | backward_allreduce_microstep: 11.65 | backward_allreduce: 2.79 | reduce_tied_grads: 0.31 | comms: 18.23 | reduce_grads: 0.20 | step: 352.53 | _step_clipping: 0.12 | _step_step: 350.74 | _step_zero_grad: 0.52 | _step_check_overflow: 0.55 samples/sec: 16.113 | iteration 18040/ 143000 | elapsed time per iteration (ms): 63551.4 | learning rate: 5.803E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.389563E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 02:12:25,378] [INFO] [logging.py:60:log_dist] [Rank 0] step=18050, skipped=18, lr=[0.0005802657831582909, 0.0005802657831582909], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18050 loss: 2.3856 iter time (s): 63.787 samples/sec: 16.053 %comms: 0.002876972876880424 %optimizer_step 0.05693345837879721 %forward: 22.836231366172964 %backward: 61.22702311980982 [2025-04-07 02:12:25,379] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29287.02 | forward: 145665.23 | backward_microstep: 390564.79 | backward: 390548.16 | backward_inner_microstep: 390530.38 | backward_inner: 390523.20 | backward_allreduce_microstep: 8.29 | backward_allreduce: 2.86 | reduce_tied_grads: 0.33 | comms: 18.35 | reduce_grads: 0.20 | step: 363.16 | _step_clipping: 0.13 | _step_step: 361.17 | _step_zero_grad: 0.54 | _step_check_overflow: 0.72 samples/sec: 16.053 | iteration 18050/ 143000 | elapsed time per iteration (ms): 63787.5 | learning rate: 5.803E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.380395E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 02:22:55,414] [INFO] [logging.py:60:log_dist] [Rank 0] step=18060, skipped=18, lr=[0.000580242267226783, 0.000580242267226783], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18060 loss: 2.3632 iter time (s): 63.003 samples/sec: 16.253 %comms: 0.002868418590712422 %optimizer_step 0.056719374085640956 %forward: 23.12938872864559 %backward: 62.00504371290522 [2025-04-07 02:22:55,415] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21287.52 | forward: 145722.03 | backward_microstep: 390664.11 | backward: 390650.21 | backward_inner_microstep: 390632.51 | backward_inner: 390625.48 | backward_allreduce_microstep: 8.29 | backward_allreduce: 2.85 | reduce_tied_grads: 0.33 | comms: 18.07 | reduce_grads: 0.19 | step: 357.35 | _step_clipping: 0.13 | _step_step: 355.38 | _step_zero_grad: 0.52 | _step_check_overflow: 0.70 samples/sec: 16.253 | iteration 18060/ 143000 | elapsed time per iteration (ms): 63003.6 | learning rate: 5.802E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.381458E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 02:33:29,101] [INFO] [logging.py:60:log_dist] [Rank 0] step=18070, skipped=18, lr=[0.0005802187377695268, 0.0005802187377695268], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18070 loss: 2.3842 iter time (s): 63.368 samples/sec: 16.160 %comms: 0.0028564839276978855 %optimizer_step 0.05584171925662293 %forward: 22.977407533558896 %backward: 61.61989674859382 [2025-04-07 02:33:29,101] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25205.38 | forward: 145603.31 | backward_microstep: 390486.60 | backward: 390473.17 | backward_inner_microstep: 390454.10 | backward_inner: 390447.18 | backward_allreduce_microstep: 9.82 | backward_allreduce: 2.83 | reduce_tied_grads: 0.31 | comms: 18.10 | reduce_grads: 0.20 | step: 353.86 | _step_clipping: 0.13 | _step_step: 351.96 | _step_zero_grad: 0.55 | _step_check_overflow: 0.61 samples/sec: 16.159 | iteration 18070/ 143000 | elapsed time per iteration (ms): 63368.7 | learning rate: 5.802E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.378073E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 02:44:05,707] [INFO] [logging.py:60:log_dist] [Rank 0] step=18080, skipped=18, lr=[0.0005801951947876576, 0.0005801951947876576], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18080 loss: 2.3697 iter time (s): 63.660 samples/sec: 16.085 %comms: 0.002861397990199698 %optimizer_step 0.0565103819200089 %forward: 22.871223796149703 %backward: 61.337076338922834 [2025-04-07 02:44:05,708] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28180.01 | forward: 145598.18 | backward_microstep: 390485.00 | backward: 390471.76 | backward_inner_microstep: 390454.13 | backward_inner: 390447.33 | backward_allreduce_microstep: 8.30 | backward_allreduce: 2.87 | reduce_tied_grads: 0.33 | comms: 18.22 | reduce_grads: 0.21 | step: 359.75 | _step_clipping: 0.12 | _step_step: 357.82 | _step_zero_grad: 0.55 | _step_check_overflow: 0.60 samples/sec: 16.085 | iteration 18080/ 143000 | elapsed time per iteration (ms): 63660.6 | learning rate: 5.802E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.376923E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 02:54:40,808] [INFO] [logging.py:60:log_dist] [Rank 0] step=18090, skipped=18, lr=[0.000580171638282312, 0.000580171638282312], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18090 loss: 2.3949 iter time (s): 63.509 samples/sec: 16.124 %comms: 0.0028658131844862874 %optimizer_step 0.058289128787198365 %forward: 22.92007439273969 %backward: 61.46199805543356 [2025-04-07 02:54:40,809] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26869.21 | forward: 145564.24 | backward_microstep: 390354.44 | backward: 390342.07 | backward_inner_microstep: 390324.70 | backward_inner: 390317.97 | backward_allreduce_microstep: 8.25 | backward_allreduce: 2.86 | reduce_tied_grads: 0.35 | comms: 18.20 | reduce_grads: 0.21 | step: 370.19 | _step_clipping: 0.14 | _step_step: 368.37 | _step_zero_grad: 0.51 | _step_check_overflow: 0.54 samples/sec: 16.123 | iteration 18090/ 143000 | elapsed time per iteration (ms): 63510.1 | learning rate: 5.802E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.381324E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 03:05:22,347] [INFO] [logging.py:60:log_dist] [Rank 0] step=18100, skipped=18, lr=[0.0005801480682546267, 0.0005801480682546267], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18100 loss: 2.3761 iter time (s): 64.153 samples/sec: 15.962 %comms: 0.0028674574045626337 %optimizer_step 0.05669721841810782 %forward: 22.69634950905485 %backward: 60.8485045164028 [2025-04-07 03:05:22,347] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 33212.67 | forward: 145604.39 | backward_microstep: 390376.53 | backward: 390362.74 | backward_inner_microstep: 390344.97 | backward_inner: 390337.90 | backward_allreduce_microstep: 8.36 | backward_allreduce: 2.86 | reduce_tied_grads: 0.33 | comms: 18.40 | reduce_grads: 0.22 | step: 363.73 | _step_clipping: 0.13 | _step_step: 361.88 | _step_zero_grad: 0.53 | _step_check_overflow: 0.55 samples/sec: 15.962 | iteration 18100/ 143000 | elapsed time per iteration (ms): 64153.8 | learning rate: 5.801E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.376015E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 03:16:01,394] [INFO] [logging.py:60:log_dist] [Rank 0] step=18110, skipped=18, lr=[0.0005801244847057394, 0.0005801244847057394], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18110 loss: 2.3722 iter time (s): 63.904 samples/sec: 16.024 %comms: 0.0028334127874710168 %optimizer_step 0.05532883442500139 %forward: 22.79679860999667 %backward: 61.07973082146484 [2025-04-07 03:16:01,395] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30729.18 | forward: 145681.13 | backward_microstep: 390339.17 | backward: 390325.16 | backward_inner_microstep: 390307.02 | backward_inner: 390300.11 | backward_allreduce_microstep: 8.63 | backward_allreduce: 3.00 | reduce_tied_grads: 0.36 | comms: 18.11 | reduce_grads: 0.22 | step: 353.57 | _step_clipping: 0.13 | _step_step: 351.70 | _step_zero_grad: 0.53 | _step_check_overflow: 0.60 samples/sec: 16.024 | iteration 18110/ 143000 | elapsed time per iteration (ms): 63904.8 | learning rate: 5.801E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.379861E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 03:26:38,103] [INFO] [logging.py:60:log_dist] [Rank 0] step=18120, skipped=18, lr=[0.0005801008876367885, 0.0005801008876367885], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18120 loss: 2.3820 iter time (s): 63.670 samples/sec: 16.083 %comms: 0.002874156607146314 %optimizer_step 0.057836546123230456 %forward: 22.881703639498845 %backward: 61.321839830388335 [2025-04-07 03:26:38,103] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28223.01 | forward: 145688.31 | backward_microstep: 390452.80 | backward: 390437.50 | backward_inner_microstep: 390420.13 | backward_inner: 390413.06 | backward_allreduce_microstep: 8.09 | backward_allreduce: 2.76 | reduce_tied_grads: 0.36 | comms: 18.30 | reduce_grads: 0.22 | step: 368.25 | _step_clipping: 0.13 | _step_step: 366.28 | _step_zero_grad: 0.56 | _step_check_overflow: 0.62 samples/sec: 16.083 | iteration 18120/ 143000 | elapsed time per iteration (ms): 63670.8 | learning rate: 5.801E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.376134E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 03:37:14,284] [INFO] [logging.py:60:log_dist] [Rank 0] step=18130, skipped=18, lr=[0.0005800772770489126, 0.0005800772770489126], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18130 loss: 2.3788 iter time (s): 63.617 samples/sec: 16.096 %comms: 0.0028853461664156595 %optimizer_step 0.06019317803705694 %forward: 22.889624318908446 %backward: 61.383207863879775 [2025-04-07 03:37:14,284] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27675.98 | forward: 145618.03 | backward_microstep: 390519.99 | backward: 390504.53 | backward_inner_microstep: 390486.40 | backward_inner: 390479.20 | backward_allreduce_microstep: 8.50 | backward_allreduce: 2.91 | reduce_tied_grads: 0.34 | comms: 18.36 | reduce_grads: 0.22 | step: 382.93 | _step_clipping: 0.14 | _step_step: 380.85 | _step_zero_grad: 0.59 | _step_check_overflow: 0.68 samples/sec: 16.096 | iteration 18130/ 143000 | elapsed time per iteration (ms): 63618.1 | learning rate: 5.801E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.372345E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 03:47:53,533] [INFO] [logging.py:60:log_dist] [Rank 0] step=18140, skipped=18, lr=[0.0005800536529432514, 0.0005800536529432514], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18140 loss: 2.3687 iter time (s): 63.924 samples/sec: 16.019 %comms: 0.002870492110430821 %optimizer_step 0.0572543508477237 %forward: 22.76474719126032 %backward: 61.05177238823105 [2025-04-07 03:47:53,533] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31143.30 | forward: 145521.97 | backward_microstep: 390281.27 | backward: 390268.95 | backward_inner_microstep: 390250.97 | backward_inner: 390244.08 | backward_allreduce_microstep: 8.66 | backward_allreduce: 3.05 | reduce_tied_grads: 0.37 | comms: 18.35 | reduce_grads: 0.23 | step: 365.99 | _step_clipping: 0.16 | _step_step: 364.09 | _step_zero_grad: 0.55 | _step_check_overflow: 0.56 samples/sec: 16.019 | iteration 18140/ 143000 | elapsed time per iteration (ms): 63924.9 | learning rate: 5.801E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.377544E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 03:58:28,259] [INFO] [logging.py:60:log_dist] [Rank 0] step=18150, skipped=18, lr=[0.0005800300153209452, 0.0005800300153209452], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18150 loss: 2.4211 iter time (s): 63.472 samples/sec: 16.133 %comms: 0.002863897833428215 %optimizer_step 0.056639502364893415 %forward: 22.931628981504165 %backward: 61.48703091662284 [2025-04-07 03:58:28,260] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26578.64 | forward: 145551.76 | backward_microstep: 390281.33 | backward: 390270.81 | backward_inner_microstep: 390253.84 | backward_inner: 390247.18 | backward_allreduce_microstep: 8.10 | backward_allreduce: 2.79 | reduce_tied_grads: 0.34 | comms: 18.18 | reduce_grads: 0.22 | step: 359.50 | _step_clipping: 0.14 | _step_step: 357.67 | _step_zero_grad: 0.51 | _step_check_overflow: 0.57 samples/sec: 16.133 | iteration 18150/ 143000 | elapsed time per iteration (ms): 63472.6 | learning rate: 5.800E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.376150E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 04:09:07,182] [INFO] [logging.py:60:log_dist] [Rank 0] step=18160, skipped=18, lr=[0.0005800063641831348, 0.0005800063641831348], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18160 loss: 2.3820 iter time (s): 63.892 samples/sec: 16.027 %comms: 0.002880613643882951 %optimizer_step 0.05687733297590016 %forward: 22.80760618056358 %backward: 61.10612552518653 [2025-04-07 04:09:07,182] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30437.86 | forward: 145721.61 | backward_microstep: 390432.41 | backward: 390417.25 | backward_inner_microstep: 390397.76 | backward_inner: 390390.67 | backward_allreduce_microstep: 10.16 | backward_allreduce: 2.91 | reduce_tied_grads: 0.34 | comms: 18.40 | reduce_grads: 0.21 | step: 363.40 | _step_clipping: 0.14 | _step_step: 361.47 | _step_zero_grad: 0.54 | _step_check_overflow: 0.60 samples/sec: 16.027 | iteration 18160/ 143000 | elapsed time per iteration (ms): 63892.3 | learning rate: 5.800E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.396124E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 04:19:42,178] [INFO] [logging.py:60:log_dist] [Rank 0] step=18170, skipped=18, lr=[0.0005799826995309616, 0.0005799826995309616], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18170 loss: 2.3917 iter time (s): 63.499 samples/sec: 16.126 %comms: 0.0028955352278821553 %optimizer_step 0.056309220285814775 %forward: 22.915788895676695 %backward: 61.46544488692022 [2025-04-07 04:19:42,179] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26856.20 | forward: 145513.01 | backward_microstep: 390311.72 | backward: 390299.53 | backward_inner_microstep: 390282.26 | backward_inner: 390275.55 | backward_allreduce_microstep: 8.22 | backward_allreduce: 2.81 | reduce_tied_grads: 0.30 | comms: 18.39 | reduce_grads: 0.22 | step: 357.56 | _step_clipping: 0.12 | _step_step: 355.67 | _step_zero_grad: 0.52 | _step_check_overflow: 0.63 samples/sec: 16.126 | iteration 18170/ 143000 | elapsed time per iteration (ms): 63499.6 | learning rate: 5.800E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.385719E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 04:30:27,451] [INFO] [logging.py:60:log_dist] [Rank 0] step=18180, skipped=18, lr=[0.0005799590213655679, 0.0005799590213655679], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18180 loss: 2.3920 iter time (s): 64.526 samples/sec: 15.869 %comms: 0.0027955999496641994 %optimizer_step 0.05626135199580941 %forward: 22.553970830396576 %backward: 60.47713035409211 [2025-04-07 04:30:27,452] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 37163.35 | forward: 145532.56 | backward_microstep: 390249.17 | backward: 390236.89 | backward_inner_microstep: 390218.69 | backward_inner: 390211.85 | backward_allreduce_microstep: 8.64 | backward_allreduce: 2.97 | reduce_tied_grads: 0.34 | comms: 18.04 | reduce_grads: 0.22 | step: 363.03 | _step_clipping: 0.12 | _step_step: 360.88 | _step_zero_grad: 0.75 | _step_check_overflow: 0.66 samples/sec: 15.869 | iteration 18180/ 143000 | elapsed time per iteration (ms): 64527.4 | learning rate: 5.800E-04 | approx flops per GPU: 68.5TFLOPS | lm_loss: 2.388243E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 04:41:09,463] [INFO] [logging.py:60:log_dist] [Rank 0] step=18190, skipped=18, lr=[0.0005799353296880963, 0.0005799353296880963], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18190 loss: 2.3806 iter time (s): 64.201 samples/sec: 15.950 %comms: 0.0028044397325047345 %optimizer_step 0.055370611194887134 %forward: 22.664895457938588 %backward: 60.78435161058149 [2025-04-07 04:41:09,464] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 33927.66 | forward: 145509.86 | backward_microstep: 390250.07 | backward: 390238.85 | backward_inner_microstep: 390219.32 | backward_inner: 390212.38 | backward_allreduce_microstep: 8.59 | backward_allreduce: 2.95 | reduce_tied_grads: 0.30 | comms: 18.00 | reduce_grads: 0.20 | step: 355.48 | _step_clipping: 0.14 | _step_step: 353.74 | _step_zero_grad: 0.51 | _step_check_overflow: 0.51 samples/sec: 15.950 | iteration 18190/ 143000 | elapsed time per iteration (ms): 64201.2 | learning rate: 5.799E-04 | approx flops per GPU: 68.8TFLOPS | lm_loss: 2.391266E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 04:51:44,254] [INFO] [logging.py:60:log_dist] [Rank 0] step=18200, skipped=18, lr=[0.0005799116244996906, 0.0005799116244996906], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18200 loss: 2.3569 iter time (s): 63.479 samples/sec: 16.131 %comms: 0.002911005966174052 %optimizer_step 0.056558211162829865 %forward: 22.948767833785276 %backward: 61.49339396332465 [2025-04-07 04:51:44,255] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26415.15 | forward: 145675.36 | backward_microstep: 390363.32 | backward: 390350.90 | backward_inner_microstep: 390333.17 | backward_inner: 390326.08 | backward_allreduce_microstep: 8.39 | backward_allreduce: 2.88 | reduce_tied_grads: 0.34 | comms: 18.48 | reduce_grads: 0.22 | step: 359.02 | _step_clipping: 0.15 | _step_step: 357.21 | _step_zero_grad: 0.52 | _step_check_overflow: 0.51 samples/sec: 16.131 | iteration 18200/ 143000 | elapsed time per iteration (ms): 63479.1 | learning rate: 5.799E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.385690E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 05:02:16,767] [INFO] [logging.py:60:log_dist] [Rank 0] step=18210, skipped=18, lr=[0.0005798879058014945, 0.0005798879058014945], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18210 loss: 2.3698 iter time (s): 63.251 samples/sec: 16.190 %comms: 0.0029053594856622996 %optimizer_step 0.056561273250150874 %forward: 23.035447157354692 %backward: 61.74406369887495 [2025-04-07 05:02:16,768] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23914.76 | forward: 145700.71 | backward_microstep: 390550.06 | backward: 390535.24 | backward_inner_microstep: 390517.75 | backward_inner: 390510.55 | backward_allreduce_microstep: 8.16 | backward_allreduce: 2.82 | reduce_tied_grads: 0.32 | comms: 18.38 | reduce_grads: 0.20 | step: 357.75 | _step_clipping: 0.13 | _step_step: 355.89 | _step_zero_grad: 0.56 | _step_check_overflow: 0.54 samples/sec: 16.189 | iteration 18210/ 143000 | elapsed time per iteration (ms): 63251.3 | learning rate: 5.799E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.370533E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 05:12:51,236] [INFO] [logging.py:60:log_dist] [Rank 0] step=18220, skipped=18, lr=[0.0005798641735946532, 0.0005798641735946532], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18220 loss: 2.3819 iter time (s): 63.446 samples/sec: 16.140 %comms: 0.0028780275120460753 %optimizer_step 0.05638589804999018 %forward: 22.93040860482577 %backward: 61.519249313563684 [2025-04-07 05:12:51,236] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26303.05 | forward: 145484.82 | backward_microstep: 390328.77 | backward: 390316.50 | backward_inner_microstep: 390299.13 | backward_inner: 390292.23 | backward_allreduce_microstep: 8.23 | backward_allreduce: 2.80 | reduce_tied_grads: 0.34 | comms: 18.26 | reduce_grads: 0.20 | step: 357.75 | _step_clipping: 0.14 | _step_step: 356.00 | _step_zero_grad: 0.50 | _step_check_overflow: 0.51 samples/sec: 16.139 | iteration 18220/ 143000 | elapsed time per iteration (ms): 63446.9 | learning rate: 5.799E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.383826E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 05:23:17,481] [INFO] [logging.py:60:log_dist] [Rank 0] step=18230, skipped=18, lr=[0.000579840427880312, 0.000579840427880312], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18230 loss: 2.3712 iter time (s): 62.624 samples/sec: 16.352 %comms: 0.0028938898102471976 %optimizer_step 0.05746940337230539 %forward: 23.220418257048117 %backward: 62.32527992743624 [2025-04-07 05:23:17,482] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18198.18 | forward: 145415.37 | backward_microstep: 390317.13 | backward: 390305.35 | backward_inner_microstep: 390287.85 | backward_inner: 390281.11 | backward_allreduce_microstep: 8.37 | backward_allreduce: 2.90 | reduce_tied_grads: 0.34 | comms: 18.12 | reduce_grads: 0.22 | step: 359.90 | _step_clipping: 0.13 | _step_step: 357.97 | _step_zero_grad: 0.55 | _step_check_overflow: 0.62 samples/sec: 16.351 | iteration 18230/ 143000 | elapsed time per iteration (ms): 62624.5 | learning rate: 5.798E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.375871E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 05:33:57,118] [INFO] [logging.py:60:log_dist] [Rank 0] step=18240, skipped=18, lr=[0.0005798166686596168, 0.0005798166686596168], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18240 loss: 2.3846 iter time (s): 63.963 samples/sec: 16.009 %comms: 0.0028283455071356454 %optimizer_step 0.05614995851126258 %forward: 22.747619571100962 %backward: 61.013409595359704 [2025-04-07 05:33:57,118] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31534.03 | forward: 145500.74 | backward_microstep: 390271.48 | backward: 390260.44 | backward_inner_microstep: 390241.45 | backward_inner: 390234.87 | backward_allreduce_microstep: 10.01 | backward_allreduce: 3.00 | reduce_tied_grads: 0.32 | comms: 18.09 | reduce_grads: 0.21 | step: 359.15 | _step_clipping: 0.14 | _step_step: 357.43 | _step_zero_grad: 0.51 | _step_check_overflow: 0.47 samples/sec: 16.009 | iteration 18240/ 143000 | elapsed time per iteration (ms): 63963.7 | learning rate: 5.798E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.380340E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 05:44:31,952] [INFO] [logging.py:60:log_dist] [Rank 0] step=18250, skipped=18, lr=[0.0005797928959337145, 0.0005797928959337145], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18250 loss: 2.3626 iter time (s): 63.483 samples/sec: 16.130 %comms: 0.0028533836511230723 %optimizer_step 0.056735336601370735 %forward: 22.94500199344579 %backward: 61.49689236433975 [2025-04-07 05:44:31,953] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26398.50 | forward: 145661.39 | backward_microstep: 390413.77 | backward: 390399.74 | backward_inner_microstep: 390381.67 | backward_inner: 390374.49 | backward_allreduce_microstep: 8.23 | backward_allreduce: 2.83 | reduce_tied_grads: 0.33 | comms: 18.11 | reduce_grads: 0.21 | step: 360.17 | _step_clipping: 0.12 | _step_step: 358.34 | _step_zero_grad: 0.51 | _step_check_overflow: 0.59 samples/sec: 16.130 | iteration 18250/ 143000 | elapsed time per iteration (ms): 63483.4 | learning rate: 5.798E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.375217E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 05:55:13,787] [INFO] [logging.py:60:log_dist] [Rank 0] step=18260, skipped=18, lr=[0.0005797691097037522, 0.0005797691097037522], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18260 loss: 2.3666 iter time (s): 64.183 samples/sec: 15.954 %comms: 0.0028365655617960312 %optimizer_step 0.055778681964011416 %forward: 22.721188698688035 %backward: 60.82509468991948 [2025-04-07 05:55:13,787] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 33208.22 | forward: 145831.02 | backward_microstep: 390405.30 | backward: 390392.68 | backward_inner_microstep: 390372.00 | backward_inner: 390365.19 | backward_allreduce_microstep: 8.16 | backward_allreduce: 2.81 | reduce_tied_grads: 0.33 | comms: 18.21 | reduce_grads: 0.21 | step: 358.00 | _step_clipping: 0.13 | _step_step: 356.17 | _step_zero_grad: 0.54 | _step_check_overflow: 0.54 samples/sec: 15.954 | iteration 18260/ 143000 | elapsed time per iteration (ms): 64183.4 | learning rate: 5.798E-04 | approx flops per GPU: 68.8TFLOPS | lm_loss: 2.382824E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 06:05:58,327] [INFO] [logging.py:60:log_dist] [Rank 0] step=18270, skipped=18, lr=[0.0005797453099708784, 0.0005797453099708784], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18270 loss: 2.3701 iter time (s): 64.453 samples/sec: 15.887 %comms: 0.002824803324136667 %optimizer_step 0.05739645982671042 %forward: 22.62097811533738 %backward: 60.610779725658304 [2025-04-07 06:05:58,328] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 35626.51 | forward: 145800.03 | backward_microstep: 390670.20 | backward: 390657.45 | backward_inner_microstep: 390636.82 | backward_inner: 390629.85 | backward_allreduce_microstep: 11.50 | backward_allreduce: 6.23 | reduce_tied_grads: 0.33 | comms: 18.21 | reduce_grads: 0.21 | step: 369.94 | _step_clipping: 0.13 | _step_step: 367.76 | _step_zero_grad: 0.56 | _step_check_overflow: 0.87 samples/sec: 15.887 | iteration 18270/ 143000 | elapsed time per iteration (ms): 64454.1 | learning rate: 5.797E-04 | approx flops per GPU: 68.5TFLOPS | lm_loss: 2.381501E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 06:17:01,171] [INFO] [logging.py:60:log_dist] [Rank 0] step=18280, skipped=18, lr=[0.0005797214967362414, 0.0005797214967362414], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18280 loss: 2.3833 iter time (s): 66.284 samples/sec: 15.449 %comms: 0.002762488661228768 %optimizer_step 0.056659307516675354 %forward: 22.024698349985712 %backward: 58.90877551324871 [2025-04-07 06:17:01,171] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 53978.44 | forward: 145987.76 | backward_microstep: 390479.89 | backward: 390468.91 | backward_inner_microstep: 390451.82 | backward_inner: 390445.22 | backward_allreduce_microstep: 8.10 | backward_allreduce: 2.77 | reduce_tied_grads: 0.33 | comms: 18.31 | reduce_grads: 0.21 | step: 375.56 | _step_clipping: 0.13 | _step_step: 373.64 | _step_zero_grad: 0.55 | _step_check_overflow: 0.60 samples/sec: 15.449 | iteration 18280/ 143000 | elapsed time per iteration (ms): 66284.3 | learning rate: 5.797E-04 | approx flops per GPU: 66.6TFLOPS | lm_loss: 2.382434E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 06:27:37,789] [INFO] [logging.py:60:log_dist] [Rank 0] step=18290, skipped=18, lr=[0.0005796976700009907, 0.0005796976700009907], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18290 loss: 2.3778 iter time (s): 63.661 samples/sec: 16.085 %comms: 0.002859844458736734 %optimizer_step 0.05777037858303509 %forward: 22.887971722958156 %backward: 61.33533705157106 [2025-04-07 06:27:37,790] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28039.13 | forward: 145707.63 | backward_microstep: 390481.11 | backward: 390468.26 | backward_inner_microstep: 390450.36 | backward_inner: 390441.90 | backward_allreduce_microstep: 8.57 | backward_allreduce: 2.98 | reduce_tied_grads: 0.32 | comms: 18.21 | reduce_grads: 0.21 | step: 367.77 | _step_clipping: 0.12 | _step_step: 365.92 | _step_zero_grad: 0.55 | _step_check_overflow: 0.58 samples/sec: 16.085 | iteration 18290/ 143000 | elapsed time per iteration (ms): 63661.8 | learning rate: 5.797E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.375845E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 06:38:18,733] [INFO] [logging.py:60:log_dist] [Rank 0] step=18300, skipped=18, lr=[0.0005796738297662763, 0.0005796738297662763], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18300 loss: 2.3824 iter time (s): 64.094 samples/sec: 15.977 %comms: 0.0028581016883558524 %optimizer_step 0.05657262515566902 %forward: 22.759146919994897 %backward: 60.97023740019267 [2025-04-07 06:38:18,733] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31852.86 | forward: 145871.97 | backward_microstep: 390792.83 | backward: 390781.28 | backward_inner_microstep: 390763.75 | backward_inner: 390757.02 | backward_allreduce_microstep: 8.43 | backward_allreduce: 2.85 | reduce_tied_grads: 0.36 | comms: 18.32 | reduce_grads: 0.22 | step: 362.60 | _step_clipping: 0.14 | _step_step: 360.68 | _step_zero_grad: 0.54 | _step_check_overflow: 0.61 samples/sec: 15.976 | iteration 18300/ 143000 | elapsed time per iteration (ms): 64094.3 | learning rate: 5.797E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.380987E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 06:48:50,558] [INFO] [logging.py:60:log_dist] [Rank 0] step=18310, skipped=18, lr=[0.0005796499760332486, 0.0005796499760332486], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18310 loss: 2.3779 iter time (s): 63.182 samples/sec: 16.207 %comms: 0.003201760799385726 %optimizer_step 0.058306778076254624 %forward: 23.06029601272537 %backward: 61.82478977223232 [2025-04-07 06:48:50,559] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23075.86 | forward: 145699.38 | backward_microstep: 390635.64 | backward: 390620.89 | backward_inner_microstep: 390601.78 | backward_inner: 390594.99 | backward_allreduce_microstep: 9.84 | backward_allreduce: 2.77 | reduce_tied_grads: 0.33 | comms: 20.23 | reduce_grads: 0.20 | step: 368.39 | _step_clipping: 0.13 | _step_step: 366.33 | _step_zero_grad: 0.58 | _step_check_overflow: 0.69 samples/sec: 16.207 | iteration 18310/ 143000 | elapsed time per iteration (ms): 63182.5 | learning rate: 5.796E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.386132E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 06:59:30,698] [INFO] [logging.py:60:log_dist] [Rank 0] step=18320, skipped=18, lr=[0.0005796261088030592, 0.0005796261088030592], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18320 loss: 2.3741 iter time (s): 64.013 samples/sec: 15.997 %comms: 0.002849471734790345 %optimizer_step 0.059379935367049004 %forward: 22.75261540235848 %backward: 61.0215031243186 [2025-04-07 06:59:30,699] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31442.70 | forward: 145647.34 | backward_microstep: 390633.43 | backward: 390619.70 | backward_inner_microstep: 390601.97 | backward_inner: 390595.18 | backward_allreduce_microstep: 8.20 | backward_allreduce: 2.81 | reduce_tied_grads: 0.36 | comms: 18.24 | reduce_grads: 0.22 | step: 380.11 | _step_clipping: 0.13 | _step_step: 378.25 | _step_zero_grad: 0.54 | _step_check_overflow: 0.55 samples/sec: 15.996 | iteration 18320/ 143000 | elapsed time per iteration (ms): 64014.1 | learning rate: 5.796E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.382897E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 07:10:09,828] [INFO] [logging.py:60:log_dist] [Rank 0] step=18330, skipped=18, lr=[0.0005796022280768598, 0.0005796022280768598], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18330 loss: 2.3725 iter time (s): 63.912 samples/sec: 16.022 %comms: 0.002860360125050989 %optimizer_step 0.0582127695880325 %forward: 22.845549229042522 %backward: 61.143860144067276 [2025-04-07 07:10:09,828] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29903.75 | forward: 146011.18 | backward_microstep: 390802.31 | backward: 390784.51 | backward_inner_microstep: 390764.38 | backward_inner: 390757.07 | backward_allreduce_microstep: 10.22 | backward_allreduce: 2.96 | reduce_tied_grads: 0.38 | comms: 18.28 | reduce_grads: 0.23 | step: 372.05 | _step_clipping: 0.16 | _step_step: 369.79 | _step_zero_grad: 0.78 | _step_check_overflow: 0.65 samples/sec: 16.022 | iteration 18330/ 143000 | elapsed time per iteration (ms): 63912.9 | learning rate: 5.796E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.378118E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 07:20:50,123] [INFO] [logging.py:60:log_dist] [Rank 0] step=18340, skipped=18, lr=[0.0005795783338558032, 0.0005795783338558032], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18340 loss: 2.3733 iter time (s): 64.029 samples/sec: 15.993 %comms: 0.002922847775088133 %optimizer_step 0.05890874111938346 %forward: 22.78586198941277 %backward: 61.030242375086075 [2025-04-07 07:20:50,124] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31164.44 | forward: 145895.32 | backward_microstep: 390790.52 | backward: 390769.79 | backward_inner_microstep: 390750.79 | backward_inner: 390743.17 | backward_allreduce_microstep: 8.91 | backward_allreduce: 3.09 | reduce_tied_grads: 0.40 | comms: 18.71 | reduce_grads: 0.25 | step: 377.19 | _step_clipping: 0.19 | _step_step: 375.02 | _step_zero_grad: 0.59 | _step_check_overflow: 0.66 samples/sec: 15.993 | iteration 18340/ 143000 | elapsed time per iteration (ms): 64029.6 | learning rate: 5.796E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.373103E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 07:31:27,866] [INFO] [logging.py:60:log_dist] [Rank 0] step=18350, skipped=18, lr=[0.0005795544261410425, 0.0005795544261410425], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18350 loss: 2.3864 iter time (s): 63.774 samples/sec: 16.057 %comms: 0.00283214573322009 %optimizer_step 0.056631400053204904 %forward: 22.80286267126658 %backward: 61.1909471216069 [2025-04-07 07:31:27,867] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29735.87 | forward: 145422.27 | backward_microstep: 390248.22 | backward: 390237.24 | backward_inner_microstep: 390218.80 | backward_inner: 390211.92 | backward_allreduce_microstep: 8.91 | backward_allreduce: 3.00 | reduce_tied_grads: 0.34 | comms: 18.06 | reduce_grads: 0.22 | step: 361.16 | _step_clipping: 0.13 | _step_step: 359.34 | _step_zero_grad: 0.54 | _step_check_overflow: 0.55 samples/sec: 16.057 | iteration 18350/ 143000 | elapsed time per iteration (ms): 63774.3 | learning rate: 5.796E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.374431E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 07:42:14,216] [INFO] [logging.py:60:log_dist] [Rank 0] step=18360, skipped=18, lr=[0.0005795305049337317, 0.0005795305049337317], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18360 loss: 2.3868 iter time (s): 64.634 samples/sec: 15.843 %comms: 0.0028246017896600066 %optimizer_step 0.05567413552049489 %forward: 22.578466873526764 %backward: 60.40687378439166 [2025-04-07 07:42:14,217] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 37572.06 | forward: 145934.73 | backward_microstep: 390451.04 | backward: 390436.63 | backward_inner_microstep: 390416.23 | backward_inner: 390409.28 | backward_allreduce_microstep: 8.61 | backward_allreduce: 2.98 | reduce_tied_grads: 0.33 | comms: 18.26 | reduce_grads: 0.20 | step: 359.85 | _step_clipping: 0.14 | _step_step: 357.97 | _step_zero_grad: 0.50 | _step_check_overflow: 0.65 samples/sec: 15.843 | iteration 18360/ 143000 | elapsed time per iteration (ms): 64635.0 | learning rate: 5.795E-04 | approx flops per GPU: 68.3TFLOPS | lm_loss: 2.380094E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 07:53:11,459] [INFO] [logging.py:60:log_dist] [Rank 0] step=18370, skipped=18, lr=[0.0005795065702350252, 0.0005795065702350252], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18370 loss: 2.3862 iter time (s): 65.724 samples/sec: 15.580 %comms: 0.003290479025728814 %optimizer_step 0.055799206699667925 %forward: 22.227784697806765 %backward: 59.39583757911125 [2025-04-07 07:53:11,460] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 48377.72 | forward: 146089.15 | backward_microstep: 390384.41 | backward: 390371.21 | backward_inner_microstep: 390352.93 | backward_inner: 390345.72 | backward_allreduce_microstep: 8.67 | backward_allreduce: 3.02 | reduce_tied_grads: 0.36 | comms: 21.63 | reduce_grads: 0.22 | step: 366.73 | _step_clipping: 0.13 | _step_step: 364.57 | _step_zero_grad: 0.58 | _step_check_overflow: 0.79 samples/sec: 15.580 | iteration 18370/ 143000 | elapsed time per iteration (ms): 65724.3 | learning rate: 5.795E-04 | approx flops per GPU: 67.2TFLOPS | lm_loss: 2.384374E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 08:03:54,571] [INFO] [logging.py:60:log_dist] [Rank 0] step=18380, skipped=18, lr=[0.0005794826220460784, 0.0005794826220460784], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18380 loss: 2.3767 iter time (s): 64.310 samples/sec: 15.923 %comms: 0.00284791512130257 %optimizer_step 0.05627252502835219 %forward: 22.667318227610515 %backward: 60.709959640795276 [2025-04-07 08:03:54,571] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 34484.24 | forward: 145774.60 | backward_microstep: 390443.77 | backward: 390428.62 | backward_inner_microstep: 390409.95 | backward_inner: 390402.81 | backward_allreduce_microstep: 8.91 | backward_allreduce: 3.04 | reduce_tied_grads: 0.37 | comms: 18.32 | reduce_grads: 0.22 | step: 361.89 | _step_clipping: 0.13 | _step_step: 359.99 | _step_zero_grad: 0.55 | _step_check_overflow: 0.59 samples/sec: 15.923 | iteration 18380/ 143000 | elapsed time per iteration (ms): 64311.1 | learning rate: 5.795E-04 | approx flops per GPU: 68.7TFLOPS | lm_loss: 2.375070E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 08:14:35,778] [INFO] [logging.py:60:log_dist] [Rank 0] step=18390, skipped=18, lr=[0.0005794586603680468, 0.0005794586603680468], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18390 loss: 2.3714 iter time (s): 64.120 samples/sec: 15.970 %comms: 0.0028200420160752774 %optimizer_step 0.05679394024231391 %forward: 22.738506771369607 %backward: 60.869164918495564 [2025-04-07 08:14:35,779] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32683.21 | forward: 145799.57 | backward_microstep: 390305.72 | backward: 390293.79 | backward_inner_microstep: 390275.01 | backward_inner: 390268.13 | backward_allreduce_microstep: 8.90 | backward_allreduce: 3.07 | reduce_tied_grads: 0.32 | comms: 18.08 | reduce_grads: 0.21 | step: 364.16 | _step_clipping: 0.12 | _step_step: 362.19 | _step_zero_grad: 0.58 | _step_check_overflow: 0.66 samples/sec: 15.970 | iteration 18390/ 143000 | elapsed time per iteration (ms): 64120.7 | learning rate: 5.795E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.368800E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 08:25:19,505] [INFO] [logging.py:60:log_dist] [Rank 0] step=18400, skipped=18, lr=[0.0005794346852020873, 0.0005794346852020873], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18400 loss: 2.3665 iter time (s): 64.372 samples/sec: 15.908 %comms: 0.0028065241011485907 %optimizer_step 0.05702105110264345 %forward: 22.656638284252935 %backward: 60.678220269773156 [2025-04-07 08:25:19,505] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 34840.59 | forward: 145845.40 | backward_microstep: 390615.43 | backward: 390598.07 | backward_inner_microstep: 390577.85 | backward_inner: 390570.65 | backward_allreduce_microstep: 8.69 | backward_allreduce: 2.95 | reduce_tied_grads: 0.34 | comms: 18.07 | reduce_grads: 0.21 | step: 367.06 | _step_clipping: 0.15 | _step_step: 365.10 | _step_zero_grad: 0.64 | _step_check_overflow: 0.52 samples/sec: 15.907 | iteration 18400/ 143000 | elapsed time per iteration (ms): 64372.7 | learning rate: 5.794E-04 | approx flops per GPU: 68.6TFLOPS | lm_loss: 2.369975E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 08:35:52,699] [INFO] [logging.py:60:log_dist] [Rank 0] step=18410, skipped=18, lr=[0.0005794106965493569, 0.0005794106965493569], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18410 loss: 2.3959 iter time (s): 63.319 samples/sec: 16.172 %comms: 0.002907124195382769 %optimizer_step 0.056944797086424984 %forward: 22.987206548702122 %backward: 61.67368204718626 [2025-04-07 08:35:52,700] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24688.71 | forward: 145552.40 | backward_microstep: 390527.92 | backward: 390510.81 | backward_inner_microstep: 390489.81 | backward_inner: 390480.97 | backward_allreduce_microstep: 8.23 | backward_allreduce: 2.85 | reduce_tied_grads: 0.32 | comms: 18.41 | reduce_grads: 0.20 | step: 360.57 | _step_clipping: 0.12 | _step_step: 358.65 | _step_zero_grad: 0.54 | _step_check_overflow: 0.62 samples/sec: 16.172 | iteration 18410/ 143000 | elapsed time per iteration (ms): 63319.5 | learning rate: 5.794E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.371722E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 08:46:32,443] [INFO] [logging.py:60:log_dist] [Rank 0] step=18420, skipped=18, lr=[0.0005793866944110133, 0.0005793866944110133], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18420 loss: 2.3599 iter time (s): 63.974 samples/sec: 16.007 %comms: 0.0028494479187949067 %optimizer_step 0.05710288709869701 %forward: 22.81761925981837 %backward: 61.06264708174355 [2025-04-07 08:46:32,444] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30625.77 | forward: 145973.03 | backward_microstep: 390659.66 | backward: 390641.10 | backward_inner_microstep: 390622.50 | backward_inner: 390613.43 | backward_allreduce_microstep: 8.79 | backward_allreduce: 3.07 | reduce_tied_grads: 0.34 | comms: 18.23 | reduce_grads: 0.22 | step: 365.31 | _step_clipping: 0.14 | _step_step: 363.43 | _step_zero_grad: 0.52 | _step_check_overflow: 0.57 samples/sec: 16.006 | iteration 18420/ 143000 | elapsed time per iteration (ms): 63974.4 | learning rate: 5.794E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.374309E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 08:57:11,897] [INFO] [logging.py:60:log_dist] [Rank 0] step=18430, skipped=18, lr=[0.0005793626787882149, 0.0005793626787882149], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18430 loss: 2.3790 iter time (s): 63.945 samples/sec: 16.014 %comms: 0.002840115513183521 %optimizer_step 0.05551865992972162 %forward: 22.776078303854543 %backward: 61.05309985114798 [2025-04-07 08:57:11,897] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30994.10 | forward: 145641.15 | backward_microstep: 390416.26 | backward: 390402.75 | backward_inner_microstep: 390384.23 | backward_inner: 390377.00 | backward_allreduce_microstep: 8.85 | backward_allreduce: 3.05 | reduce_tied_grads: 0.31 | comms: 18.16 | reduce_grads: 0.20 | step: 355.01 | _step_clipping: 0.13 | _step_step: 353.26 | _step_zero_grad: 0.49 | _step_check_overflow: 0.55 samples/sec: 16.014 | iteration 18430/ 143000 | elapsed time per iteration (ms): 63945.3 | learning rate: 5.794E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.375089E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 09:07:45,205] [INFO] [logging.py:60:log_dist] [Rank 0] step=18440, skipped=18, lr=[0.000579338649682121, 0.000579338649682121], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18440 loss: 2.3886 iter time (s): 63.330 samples/sec: 16.169 %comms: 0.002895571942183907 %optimizer_step 0.05660972275759637 %forward: 22.98022958261613 %backward: 61.673894367713444 [2025-04-07 09:07:45,206] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24732.90 | forward: 145534.35 | backward_microstep: 390598.01 | backward: 390582.25 | backward_inner_microstep: 390564.24 | backward_inner: 390557.05 | backward_allreduce_microstep: 8.41 | backward_allreduce: 2.91 | reduce_tied_grads: 0.33 | comms: 18.34 | reduce_grads: 0.21 | step: 358.51 | _step_clipping: 0.15 | _step_step: 356.41 | _step_zero_grad: 0.54 | _step_check_overflow: 0.75 samples/sec: 16.169 | iteration 18440/ 143000 | elapsed time per iteration (ms): 63330.8 | learning rate: 5.793E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.382512E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 09:18:35,680] [INFO] [logging.py:60:log_dist] [Rank 0] step=18450, skipped=18, lr=[0.0005793146070938914, 0.0005793146070938914], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18450 loss: 2.4000 iter time (s): 65.047 samples/sec: 15.743 %comms: 0.0028227134502595 %optimizer_step 0.058276381833737866 %forward: 22.429620119527353 %backward: 60.06750203025912 [2025-04-07 09:18:35,681] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 41361.11 | forward: 145897.54 | backward_microstep: 390738.17 | backward: 390720.00 | backward_inner_microstep: 390701.36 | backward_inner: 390693.98 | backward_allreduce_microstep: 8.71 | backward_allreduce: 3.00 | reduce_tied_grads: 0.39 | comms: 18.36 | reduce_grads: 0.24 | step: 379.07 | _step_clipping: 0.12 | _step_step: 377.02 | _step_zero_grad: 0.59 | _step_check_overflow: 0.66 samples/sec: 15.742 | iteration 18450/ 143000 | elapsed time per iteration (ms): 65047.5 | learning rate: 5.793E-04 | approx flops per GPU: 67.9TFLOPS | lm_loss: 2.379924E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 09:29:08,635] [INFO] [logging.py:60:log_dist] [Rank 0] step=18460, skipped=18, lr=[0.0005792905510246862, 0.0005792905510246862], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18460 loss: 2.3846 iter time (s): 63.295 samples/sec: 16.178 %comms: 0.002849238302409227 %optimizer_step 0.05561075373268128 %forward: 23.06640620543121 %backward: 61.68942278840645 [2025-04-07 09:29:08,635] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24058.39 | forward: 145998.54 | backward_microstep: 390477.48 | backward: 390462.46 | backward_inner_microstep: 390443.97 | backward_inner: 390437.06 | backward_allreduce_microstep: 8.77 | backward_allreduce: 3.05 | reduce_tied_grads: 0.31 | comms: 18.03 | reduce_grads: 0.21 | step: 351.99 | _step_clipping: 0.13 | _step_step: 350.16 | _step_zero_grad: 0.52 | _step_check_overflow: 0.56 samples/sec: 16.178 | iteration 18460/ 143000 | elapsed time per iteration (ms): 63295.5 | learning rate: 5.793E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.390952E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 09:39:44,801] [INFO] [logging.py:60:log_dist] [Rank 0] step=18470, skipped=18, lr=[0.0005792664814756667, 0.0005792664814756667], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18470 loss: 2.3766 iter time (s): 63.616 samples/sec: 16.097 %comms: 0.0028934706888248604 %optimizer_step 0.05695064641769219 %forward: 22.89968513154897 %backward: 61.376447051750425 [2025-04-07 09:39:44,802] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27602.99 | forward: 145678.66 | backward_microstep: 390468.43 | backward: 390452.47 | backward_inner_microstep: 390434.21 | backward_inner: 390427.00 | backward_allreduce_microstep: 8.59 | backward_allreduce: 3.00 | reduce_tied_grads: 0.35 | comms: 18.41 | reduce_grads: 0.22 | step: 362.30 | _step_clipping: 0.14 | _step_step: 360.19 | _step_zero_grad: 0.60 | _step_check_overflow: 0.68 samples/sec: 16.096 | iteration 18470/ 143000 | elapsed time per iteration (ms): 63616.7 | learning rate: 5.793E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.384012E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 09:50:27,651] [INFO] [logging.py:60:log_dist] [Rank 0] step=18480, skipped=18, lr=[0.0005792423984479945, 0.0005792423984479945], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18480 loss: 2.3644 iter time (s): 64.284 samples/sec: 15.929 %comms: 0.002864983165104794 %optimizer_step 0.055700757320959374 %forward: 22.701631325523575 %backward: 60.784948182521006 [2025-04-07 09:50:27,652] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 33672.40 | forward: 145935.96 | backward_microstep: 390770.89 | backward: 390752.10 | backward_inner_microstep: 390733.06 | backward_inner: 390723.88 | backward_allreduce_microstep: 9.05 | backward_allreduce: 3.22 | reduce_tied_grads: 0.39 | comms: 18.42 | reduce_grads: 0.23 | step: 358.07 | _step_clipping: 0.17 | _step_step: 356.15 | _step_zero_grad: 0.54 | _step_check_overflow: 0.55 samples/sec: 15.929 | iteration 18480/ 143000 | elapsed time per iteration (ms): 64285.0 | learning rate: 5.792E-04 | approx flops per GPU: 68.7TFLOPS | lm_loss: 2.374004E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 10:01:09,280] [INFO] [logging.py:60:log_dist] [Rank 0] step=18490, skipped=18, lr=[0.000579218301942832, 0.000579218301942832], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18490 loss: 2.3945 iter time (s): 64.162 samples/sec: 15.960 %comms: 0.002849510588520865 %optimizer_step 0.05749869065016326 %forward: 22.712289774369708 %backward: 60.88051322656533 [2025-04-07 10:01:09,281] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32792.21 | forward: 145727.38 | backward_microstep: 390640.20 | backward: 390623.66 | backward_inner_microstep: 390605.36 | backward_inner: 390598.20 | backward_allreduce_microstep: 8.62 | backward_allreduce: 2.98 | reduce_tied_grads: 0.35 | comms: 18.28 | reduce_grads: 0.23 | step: 368.93 | _step_clipping: 0.14 | _step_step: 366.84 | _step_zero_grad: 0.56 | _step_check_overflow: 0.73 samples/sec: 15.959 | iteration 18490/ 143000 | elapsed time per iteration (ms): 64163.0 | learning rate: 5.792E-04 | approx flops per GPU: 68.8TFLOPS | lm_loss: 2.375899E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 10:11:56,694] [INFO] [logging.py:60:log_dist] [Rank 0] step=18500, skipped=18, lr=[0.0005791941919613423, 0.0005791941919613423], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18500 loss: 2.3712 iter time (s): 64.741 samples/sec: 15.817 %comms: 0.002851968463652101 %optimizer_step 0.05573717792101875 %forward: 22.49576257782622 %backward: 60.327526911074145 [2025-04-07 10:11:56,695] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 38738.24 | forward: 145639.19 | backward_microstep: 390580.52 | backward: 390564.77 | backward_inner_microstep: 390546.82 | backward_inner: 390539.54 | backward_allreduce_microstep: 8.43 | backward_allreduce: 2.92 | reduce_tied_grads: 0.35 | comms: 18.46 | reduce_grads: 0.22 | step: 360.85 | _step_clipping: 0.13 | _step_step: 358.66 | _step_zero_grad: 0.56 | _step_check_overflow: 0.70 samples/sec: 15.817 | iteration 18500/ 143000 | elapsed time per iteration (ms): 64741.4 | learning rate: 5.792E-04 | approx flops per GPU: 68.2TFLOPS | lm_loss: 2.378860E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 10:22:36,383] [INFO] [logging.py:60:log_dist] [Rank 0] step=18510, skipped=18, lr=[0.0005791700685046888, 0.0005791700685046888], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18510 loss: 2.3672 iter time (s): 63.968 samples/sec: 16.008 %comms: 0.0028617368140319796 %optimizer_step 0.05652609479322486 %forward: 22.785920367136477 %backward: 61.080767250092926 [2025-04-07 10:22:36,384] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30704.83 | forward: 145757.44 | backward_microstep: 390741.48 | backward: 390722.71 | backward_inner_microstep: 390703.89 | backward_inner: 390696.40 | backward_allreduce_microstep: 8.90 | backward_allreduce: 3.29 | reduce_tied_grads: 0.34 | comms: 18.31 | reduce_grads: 0.22 | step: 361.59 | _step_clipping: 0.16 | _step_step: 359.82 | _step_zero_grad: 0.50 | _step_check_overflow: 0.49 samples/sec: 16.008 | iteration 18510/ 143000 | elapsed time per iteration (ms): 63968.8 | learning rate: 5.792E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.378326E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 10:33:14,291] [INFO] [logging.py:60:log_dist] [Rank 0] step=18520, skipped=18, lr=[0.0005791459315740361, 0.0005791459315740361], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18520 loss: 2.3786 iter time (s): 63.790 samples/sec: 16.053 %comms: 0.0028914008033918653 %optimizer_step 0.056905665154249 %forward: 22.85127980194038 %backward: 61.223001670206024 [2025-04-07 10:33:14,292] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29114.73 | forward: 145768.74 | backward_microstep: 390558.15 | backward: 390542.67 | backward_inner_microstep: 390524.59 | backward_inner: 390517.26 | backward_allreduce_microstep: 8.54 | backward_allreduce: 2.97 | reduce_tied_grads: 0.34 | comms: 18.44 | reduce_grads: 0.23 | step: 363.00 | _step_clipping: 0.13 | _step_step: 360.99 | _step_zero_grad: 0.56 | _step_check_overflow: 0.68 samples/sec: 16.052 | iteration 18520/ 143000 | elapsed time per iteration (ms): 63790.8 | learning rate: 5.791E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.370307E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 10:43:55,326] [INFO] [logging.py:60:log_dist] [Rank 0] step=18530, skipped=18, lr=[0.0005791217811705489, 0.0005791217811705489], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18530 loss: 2.3778 iter time (s): 64.103 samples/sec: 15.974 %comms: 0.0028355294013506314 %optimizer_step 0.05597751154939489 %forward: 22.724939254329932 %backward: 60.91271480423956 [2025-04-07 10:43:55,327] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32449.40 | forward: 145673.37 | backward_microstep: 390483.04 | backward: 390467.95 | backward_inner_microstep: 390449.37 | backward_inner: 390442.24 | backward_allreduce_microstep: 8.84 | backward_allreduce: 3.03 | reduce_tied_grads: 0.33 | comms: 18.18 | reduce_grads: 0.22 | step: 358.83 | _step_clipping: 0.14 | _step_step: 356.99 | _step_zero_grad: 0.56 | _step_check_overflow: 0.53 samples/sec: 15.974 | iteration 18530/ 143000 | elapsed time per iteration (ms): 64103.5 | learning rate: 5.791E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.371550E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 10:54:34,513] [INFO] [logging.py:60:log_dist] [Rank 0] step=18540, skipped=18, lr=[0.0005790976172953931, 0.0005790976172953931], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18540 loss: 2.3697 iter time (s): 63.918 samples/sec: 16.021 %comms: 0.003103455024843808 %optimizer_step 0.05785837437658292 %forward: 22.808133189942453 %backward: 61.125507792575405 [2025-04-07 10:54:34,513] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30196.13 | forward: 145785.03 | backward_microstep: 390720.48 | backward: 390702.03 | backward_inner_microstep: 390682.82 | backward_inner: 390675.18 | backward_allreduce_microstep: 8.94 | backward_allreduce: 3.08 | reduce_tied_grads: 0.37 | comms: 19.84 | reduce_grads: 0.23 | step: 369.82 | _step_clipping: 0.13 | _step_step: 367.74 | _step_zero_grad: 0.60 | _step_check_overflow: 0.68 samples/sec: 16.020 | iteration 18540/ 143000 | elapsed time per iteration (ms): 63918.7 | learning rate: 5.791E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.370366E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 11:05:07,701] [INFO] [logging.py:60:log_dist] [Rank 0] step=18550, skipped=18, lr=[0.0005790734399497346, 0.0005790734399497346], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18550 loss: 2.3699 iter time (s): 63.318 samples/sec: 16.172 %comms: 0.0029034997040546457 %optimizer_step 0.05712009670028385 %forward: 23.000924977965635 %backward: 61.68621436263817 [2025-04-07 11:05:07,702] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24494.46 | forward: 145637.87 | backward_microstep: 390602.65 | backward: 390586.41 | backward_inner_microstep: 390568.05 | backward_inner: 390560.77 | backward_allreduce_microstep: 8.55 | backward_allreduce: 2.94 | reduce_tied_grads: 0.34 | comms: 18.38 | reduce_grads: 0.20 | step: 361.67 | _step_clipping: 0.12 | _step_step: 359.81 | _step_zero_grad: 0.55 | _step_check_overflow: 0.58 samples/sec: 16.172 | iteration 18550/ 143000 | elapsed time per iteration (ms): 63318.9 | learning rate: 5.791E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.367656E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 11:15:54,923] [INFO] [logging.py:60:log_dist] [Rank 0] step=18560, skipped=18, lr=[0.0005790492491347405, 0.0005790492491347405], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18560 loss: 2.3630 iter time (s): 64.722 samples/sec: 15.822 %comms: 0.00286854414473552 %optimizer_step 0.05604800773378068 %forward: 22.525985063723084 %backward: 60.32561579497072 [2025-04-07 11:15:54,924] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 38544.40 | forward: 145791.61 | backward_microstep: 390453.12 | backward: 390436.58 | backward_inner_microstep: 390418.54 | backward_inner: 390411.49 | backward_allreduce_microstep: 8.53 | backward_allreduce: 2.97 | reduce_tied_grads: 0.34 | comms: 18.57 | reduce_grads: 0.22 | step: 362.75 | _step_clipping: 0.13 | _step_step: 360.82 | _step_zero_grad: 0.55 | _step_check_overflow: 0.60 samples/sec: 15.821 | iteration 18560/ 143000 | elapsed time per iteration (ms): 64722.2 | learning rate: 5.790E-04 | approx flops per GPU: 68.3TFLOPS | lm_loss: 2.366927E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 11:26:36,603] [INFO] [logging.py:60:log_dist] [Rank 0] step=18570, skipped=18, lr=[0.0005790250448515783, 0.0005790250448515783], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18570 loss: 2.3769 iter time (s): 64.167 samples/sec: 15.958 %comms: 0.0028373583312008683 %optimizer_step 0.05733604740804128 %forward: 22.71713670472696 %backward: 60.840156225713635 [2025-04-07 11:26:36,604] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 33045.33 | forward: 145770.01 | backward_microstep: 390411.92 | backward: 390395.59 | backward_inner_microstep: 390377.41 | backward_inner: 390370.54 | backward_allreduce_microstep: 8.47 | backward_allreduce: 2.92 | reduce_tied_grads: 0.33 | comms: 18.21 | reduce_grads: 0.21 | step: 367.91 | _step_clipping: 0.13 | _step_step: 366.02 | _step_zero_grad: 0.56 | _step_check_overflow: 0.58 samples/sec: 15.958 | iteration 18570/ 143000 | elapsed time per iteration (ms): 64168.0 | learning rate: 5.790E-04 | approx flops per GPU: 68.8TFLOPS | lm_loss: 2.377105E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 11:37:14,770] [INFO] [logging.py:60:log_dist] [Rank 0] step=18580, skipped=18, lr=[0.0005790008271014163, 0.0005790008271014163], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18580 loss: 2.3817 iter time (s): 63.816 samples/sec: 16.046 %comms: 0.002834446132583578 %optimizer_step 0.05650903313134163 %forward: 22.841392901719203 %backward: 61.15837921729999 [2025-04-07 11:37:14,771] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29671.04 | forward: 145764.95 | backward_microstep: 390301.51 | backward: 390289.16 | backward_inner_microstep: 390267.51 | backward_inner: 390258.90 | backward_allreduce_microstep: 8.72 | backward_allreduce: 3.00 | reduce_tied_grads: 0.36 | comms: 18.09 | reduce_grads: 0.25 | step: 360.62 | _step_clipping: 0.13 | _step_step: 358.81 | _step_zero_grad: 0.52 | _step_check_overflow: 0.52 samples/sec: 16.046 | iteration 18580/ 143000 | elapsed time per iteration (ms): 63816.7 | learning rate: 5.790E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.383043E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 11:47:51,700] [INFO] [logging.py:60:log_dist] [Rank 0] step=18590, skipped=18, lr=[0.0005789765958854234, 0.0005789765958854234], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18590 loss: 2.3891 iter time (s): 63.692 samples/sec: 16.077 %comms: 0.0028990238399484477 %optimizer_step 0.059982759488828245 %forward: 22.871859029036045 %backward: 61.319467428755004 [2025-04-07 11:47:51,701] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28188.73 | forward: 145676.25 | backward_microstep: 390574.58 | backward: 390558.12 | backward_inner_microstep: 390539.71 | backward_inner: 390532.56 | backward_allreduce_microstep: 8.51 | backward_allreduce: 3.06 | reduce_tied_grads: 0.37 | comms: 18.46 | reduce_grads: 0.23 | step: 382.04 | _step_clipping: 0.13 | _step_step: 379.99 | _step_zero_grad: 0.57 | _step_check_overflow: 0.70 samples/sec: 16.077 | iteration 18590/ 143000 | elapsed time per iteration (ms): 63693.0 | learning rate: 5.790E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.376424E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 11:58:28,468] [INFO] [logging.py:60:log_dist] [Rank 0] step=18600, skipped=18, lr=[0.0005789523512047689, 0.0005789523512047689], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18600 loss: 2.3836 iter time (s): 63.676 samples/sec: 16.081 %comms: 0.0028580820359158676 %optimizer_step 0.0557755647238252 %forward: 22.851016062448 %backward: 61.275213570969235 [2025-04-07 11:58:28,469] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28652.55 | forward: 145506.79 | backward_microstep: 390188.11 | backward: 390177.81 | backward_inner_microstep: 390160.73 | backward_inner: 390154.08 | backward_allreduce_microstep: 8.15 | backward_allreduce: 2.83 | reduce_tied_grads: 0.32 | comms: 18.20 | reduce_grads: 0.21 | step: 355.16 | _step_clipping: 0.13 | _step_step: 353.18 | _step_zero_grad: 0.49 | _step_check_overflow: 0.73 samples/sec: 16.081 | iteration 18600/ 143000 | elapsed time per iteration (ms): 63676.8 | learning rate: 5.790E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.373299E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 12:09:01,389] [INFO] [logging.py:60:log_dist] [Rank 0] step=18610, skipped=18, lr=[0.0005789280930606232, 0.0005789280930606232], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18610 loss: 2.3715 iter time (s): 63.291 samples/sec: 16.179 %comms: 0.00284660638766532 %optimizer_step 0.05632225098942399 %forward: 23.01086788301179 %backward: 61.68768250324368 [2025-04-07 12:09:01,390] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24400.50 | forward: 145639.06 | backward_microstep: 390443.31 | backward: 390430.12 | backward_inner_microstep: 390412.39 | backward_inner: 390403.74 | backward_allreduce_microstep: 8.41 | backward_allreduce: 2.90 | reduce_tied_grads: 0.29 | comms: 18.02 | reduce_grads: 0.20 | step: 356.47 | _step_clipping: 0.13 | _step_step: 354.65 | _step_zero_grad: 0.52 | _step_check_overflow: 0.59 samples/sec: 16.179 | iteration 18610/ 143000 | elapsed time per iteration (ms): 63292.0 | learning rate: 5.789E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.372738E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 12:19:32,491] [INFO] [logging.py:60:log_dist] [Rank 0] step=18620, skipped=18, lr=[0.0005789038214541567, 0.0005789038214541567], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18620 loss: 2.3698 iter time (s): 63.110 samples/sec: 16.226 %comms: 0.0029087919255212295 %optimizer_step 0.05741856885814301 %forward: 23.069811269150737 %backward: 61.88335490658381 [2025-04-07 12:19:32,492] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22487.95 | forward: 145592.72 | backward_microstep: 390560.69 | backward: 390543.54 | backward_inner_microstep: 390525.78 | backward_inner: 390518.90 | backward_allreduce_microstep: 8.52 | backward_allreduce: 3.11 | reduce_tied_grads: 0.35 | comms: 18.36 | reduce_grads: 0.23 | step: 362.37 | _step_clipping: 0.13 | _step_step: 360.35 | _step_zero_grad: 0.52 | _step_check_overflow: 0.74 samples/sec: 16.226 | iteration 18620/ 143000 | elapsed time per iteration (ms): 63110.3 | learning rate: 5.789E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.372242E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 12:30:13,583] [INFO] [logging.py:60:log_dist] [Rank 0] step=18630, skipped=18, lr=[0.0005788795363865414, 0.0005788795363865414], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18630 loss: 2.3564 iter time (s): 64.109 samples/sec: 15.973 %comms: 0.002871017435821767 %optimizer_step 0.0564151820788443 %forward: 22.730280718221387 %backward: 60.911467650062875 [2025-04-07 12:30:13,584] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32396.25 | forward: 145720.53 | backward_microstep: 390510.95 | backward: 390494.57 | backward_inner_microstep: 390476.31 | backward_inner: 390468.98 | backward_allreduce_microstep: 8.57 | backward_allreduce: 2.95 | reduce_tied_grads: 0.32 | comms: 18.41 | reduce_grads: 0.20 | step: 361.67 | _step_clipping: 0.14 | _step_step: 359.67 | _step_zero_grad: 0.61 | _step_check_overflow: 0.54 samples/sec: 15.973 | iteration 18630/ 143000 | elapsed time per iteration (ms): 64109.2 | learning rate: 5.789E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.368296E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 12:40:43,711] [INFO] [logging.py:60:log_dist] [Rank 0] step=18640, skipped=18, lr=[0.0005788552378589491, 0.0005788552378589491], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18640 loss: 2.3787 iter time (s): 63.012 samples/sec: 16.251 %comms: 0.002934744867809434 %optimizer_step 0.05888563325963011 %forward: 23.070871655643774 %backward: 61.92578111086804 [2025-04-07 12:40:43,712] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22093.01 | forward: 145374.54 | backward_microstep: 390217.46 | backward: 390207.70 | backward_inner_microstep: 390190.07 | backward_inner: 390183.38 | backward_allreduce_microstep: 8.51 | backward_allreduce: 2.98 | reduce_tied_grads: 0.36 | comms: 18.49 | reduce_grads: 0.22 | step: 371.05 | _step_clipping: 0.14 | _step_step: 369.20 | _step_zero_grad: 0.53 | _step_check_overflow: 0.55 samples/sec: 16.251 | iteration 18640/ 143000 | elapsed time per iteration (ms): 63012.8 | learning rate: 5.789E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.367907E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 12:51:27,120] [INFO] [logging.py:60:log_dist] [Rank 0] step=18650, skipped=18, lr=[0.0005788309258725525, 0.0005788309258725525], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18650 loss: 2.3841 iter time (s): 64.340 samples/sec: 15.915 %comms: 0.0028507470784386426 %optimizer_step 0.057286079554408495 %forward: 22.698644152769546 %backward: 60.705846140176526 [2025-04-07 12:51:27,121] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 34289.70 | forward: 146043.66 | backward_microstep: 390598.94 | backward: 390582.97 | backward_inner_microstep: 390564.46 | backward_inner: 390555.63 | backward_allreduce_microstep: 8.67 | backward_allreduce: 2.99 | reduce_tied_grads: 0.31 | comms: 18.34 | reduce_grads: 0.21 | step: 368.58 | _step_clipping: 0.14 | _step_step: 366.63 | _step_zero_grad: 0.60 | _step_check_overflow: 0.55 samples/sec: 15.915 | iteration 18650/ 143000 | elapsed time per iteration (ms): 64340.9 | learning rate: 5.788E-04 | approx flops per GPU: 68.7TFLOPS | lm_loss: 2.378365E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 13:02:00,301] [INFO] [logging.py:60:log_dist] [Rank 0] step=18660, skipped=18, lr=[0.0005788066004285251, 0.0005788066004285251], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18660 loss: 2.3945 iter time (s): 63.318 samples/sec: 16.172 %comms: 0.0029062828570902647 %optimizer_step 0.05944616079565781 %forward: 22.991510751276305 %backward: 61.686939633525085 [2025-04-07 13:02:00,302] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24510.24 | forward: 145576.54 | backward_microstep: 390601.99 | backward: 390586.38 | backward_inner_microstep: 390569.08 | backward_inner: 390560.35 | backward_allreduce_microstep: 8.11 | backward_allreduce: 2.76 | reduce_tied_grads: 0.33 | comms: 18.40 | reduce_grads: 0.21 | step: 376.40 | _step_clipping: 0.12 | _step_step: 374.39 | _step_zero_grad: 0.53 | _step_check_overflow: 0.70 samples/sec: 16.172 | iteration 18660/ 143000 | elapsed time per iteration (ms): 63318.1 | learning rate: 5.788E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.387324E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 13:12:41,175] [INFO] [logging.py:60:log_dist] [Rank 0] step=18670, skipped=18, lr=[0.0005787822615280411, 0.0005787822615280411], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18670 loss: 2.3794 iter time (s): 64.087 samples/sec: 15.978 %comms: 0.002841972260787486 %optimizer_step 0.05728219199388922 %forward: 22.782744587143757 %backward: 60.90967583396012 [2025-04-07 13:12:41,175] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32006.60 | forward: 146007.17 | backward_microstep: 390362.85 | backward: 390350.22 | backward_inner_microstep: 390331.84 | backward_inner: 390324.85 | backward_allreduce_microstep: 8.78 | backward_allreduce: 3.02 | reduce_tied_grads: 0.33 | comms: 18.21 | reduce_grads: 0.23 | step: 367.10 | _step_clipping: 0.15 | _step_step: 365.11 | _step_zero_grad: 0.55 | _step_check_overflow: 0.64 samples/sec: 15.978 | iteration 18670/ 143000 | elapsed time per iteration (ms): 64087.4 | learning rate: 5.788E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.371465E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 13:23:16,988] [INFO] [logging.py:60:log_dist] [Rank 0] step=18680, skipped=18, lr=[0.000578757909172275, 0.000578757909172275], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18680 loss: 2.3664 iter time (s): 63.581 samples/sec: 16.106 %comms: 0.003057410960099894 %optimizer_step 0.05720557876051502 %forward: 22.893087986764638 %backward: 61.413213905327936 [2025-04-07 13:23:16,989] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27286.95 | forward: 145555.76 | backward_microstep: 390485.14 | backward: 390469.26 | backward_inner_microstep: 390450.59 | backward_inner: 390441.72 | backward_allreduce_microstep: 8.85 | backward_allreduce: 3.04 | reduce_tied_grads: 0.33 | comms: 19.44 | reduce_grads: 0.24 | step: 363.72 | _step_clipping: 0.12 | _step_step: 361.56 | _step_zero_grad: 0.60 | _step_check_overflow: 0.72 samples/sec: 16.105 | iteration 18680/ 143000 | elapsed time per iteration (ms): 63581.3 | learning rate: 5.788E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.363106E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 13:33:44,716] [INFO] [logging.py:60:log_dist] [Rank 0] step=18690, skipped=18, lr=[0.0005787335433624022, 0.0005787335433624022], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18690 loss: 2.3765 iter time (s): 62.772 samples/sec: 16.313 %comms: 0.002993407719120145 %optimizer_step 0.059199639825428904 %forward: 23.203435447577867 %backward: 62.23172438378687 [2025-04-07 13:33:44,717] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18903.95 | forward: 145652.81 | backward_microstep: 390659.37 | backward: 390641.52 | backward_inner_microstep: 390623.46 | backward_inner: 390616.29 | backward_allreduce_microstep: 8.44 | backward_allreduce: 2.92 | reduce_tied_grads: 0.36 | comms: 18.79 | reduce_grads: 0.23 | step: 371.61 | _step_clipping: 0.14 | _step_step: 369.41 | _step_zero_grad: 0.57 | _step_check_overflow: 0.79 samples/sec: 16.313 | iteration 18690/ 143000 | elapsed time per iteration (ms): 62772.8 | learning rate: 5.787E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.368050E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 13:44:15,966] [INFO] [logging.py:60:log_dist] [Rank 0] step=18700, skipped=18, lr=[0.0005787091640995986, 0.0005787091640995986], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18700 loss: 2.3759 iter time (s): 63.124 samples/sec: 16.222 %comms: 0.0028869230625470773 %optimizer_step 0.05790340130476693 %forward: 23.060357570691078 %backward: 61.85547864329462 [2025-04-07 13:44:15,967] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22743.94 | forward: 145567.09 | backward_microstep: 390476.14 | backward: 390458.90 | backward_inner_microstep: 390439.53 | backward_inner: 390432.57 | backward_allreduce_microstep: 9.99 | backward_allreduce: 2.83 | reduce_tied_grads: 0.34 | comms: 18.22 | reduce_grads: 0.22 | step: 365.51 | _step_clipping: 0.13 | _step_step: 363.61 | _step_zero_grad: 0.57 | _step_check_overflow: 0.54 samples/sec: 16.222 | iteration 18700/ 143000 | elapsed time per iteration (ms): 63125.0 | learning rate: 5.787E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.383630E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 13:54:55,330] [INFO] [logging.py:60:log_dist] [Rank 0] step=18710, skipped=18, lr=[0.000578684771385041, 0.000578684771385041], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18710 loss: 2.3868 iter time (s): 63.936 samples/sec: 16.016 %comms: 0.003276369458303452 %optimizer_step 0.05648182610771406 %forward: 22.825079112449163 %backward: 61.09475409299292 [2025-04-07 13:54:55,330] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30342.28 | forward: 145933.72 | backward_microstep: 390633.77 | backward: 390613.54 | backward_inner_microstep: 390593.33 | backward_inner: 390586.09 | backward_allreduce_microstep: 8.71 | backward_allreduce: 3.13 | reduce_tied_grads: 0.32 | comms: 20.95 | reduce_grads: 0.21 | step: 361.12 | _step_clipping: 0.14 | _step_step: 359.23 | _step_zero_grad: 0.57 | _step_check_overflow: 0.58 samples/sec: 16.016 | iteration 18710/ 143000 | elapsed time per iteration (ms): 63936.3 | learning rate: 5.787E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.381957E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 14:05:28,207] [INFO] [logging.py:60:log_dist] [Rank 0] step=18720, skipped=18, lr=[0.0005786603652199068, 0.0005786603652199068], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18720 loss: 2.3976 iter time (s): 63.287 samples/sec: 16.180 %comms: 0.0029306951807578224 %optimizer_step 0.057072871953793375 %forward: 23.016138880338065 %backward: 61.69074658293766 [2025-04-07 14:05:28,208] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24318.61 | forward: 145662.59 | backward_microstep: 390438.78 | backward: 390423.16 | backward_inner_microstep: 390405.35 | backward_inner: 390398.33 | backward_allreduce_microstep: 8.33 | backward_allreduce: 2.87 | reduce_tied_grads: 0.31 | comms: 18.55 | reduce_grads: 0.19 | step: 361.20 | _step_clipping: 0.12 | _step_step: 359.29 | _step_zero_grad: 0.59 | _step_check_overflow: 0.52 samples/sec: 16.180 | iteration 18720/ 143000 | elapsed time per iteration (ms): 63287.8 | learning rate: 5.787E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.374289E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 14:16:04,942] [INFO] [logging.py:60:log_dist] [Rank 0] step=18730, skipped=18, lr=[0.0005786359456053737, 0.0005786359456053737], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18730 loss: 2.3757 iter time (s): 63.673 samples/sec: 16.082 %comms: 0.0028424715191155544 %optimizer_step 0.05740794952525459 %forward: 22.88747867365179 %backward: 61.32878865702619 [2025-04-07 14:16:04,942] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28023.72 | forward: 145731.14 | backward_microstep: 390515.47 | backward: 390497.98 | backward_inner_microstep: 390480.15 | backward_inner: 390473.13 | backward_allreduce_microstep: 8.31 | backward_allreduce: 2.87 | reduce_tied_grads: 0.32 | comms: 18.10 | reduce_grads: 0.20 | step: 365.53 | _step_clipping: 0.11 | _step_step: 363.66 | _step_zero_grad: 0.55 | _step_check_overflow: 0.55 samples/sec: 16.082 | iteration 18730/ 143000 | elapsed time per iteration (ms): 63673.5 | learning rate: 5.786E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.379916E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 14:26:43,478] [INFO] [logging.py:60:log_dist] [Rank 0] step=18740, skipped=18, lr=[0.0005786115125426206, 0.0005786115125426206], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18740 loss: 2.3649 iter time (s): 63.853 samples/sec: 16.037 %comms: 0.0028603256519941687 %optimizer_step 0.05745149178282096 %forward: 22.85294434277532 %backward: 61.14432965654268 [2025-04-07 14:26:43,479] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29680.64 | forward: 145923.05 | backward_microstep: 390439.11 | backward: 390425.27 | backward_inner_microstep: 390407.12 | backward_inner: 390399.93 | backward_allreduce_microstep: 8.62 | backward_allreduce: 2.94 | reduce_tied_grads: 0.36 | comms: 18.26 | reduce_grads: 0.21 | step: 366.85 | _step_clipping: 0.14 | _step_step: 364.98 | _step_zero_grad: 0.53 | _step_check_overflow: 0.56 samples/sec: 16.037 | iteration 18740/ 143000 | elapsed time per iteration (ms): 63853.6 | learning rate: 5.786E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.379009E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 14:37:17,767] [INFO] [logging.py:60:log_dist] [Rank 0] step=18750, skipped=18, lr=[0.0005785870660328263, 0.0005785870660328263], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18750 loss: 2.3817 iter time (s): 63.428 samples/sec: 16.144 %comms: 0.002874331052475764 %optimizer_step 0.05801049445902456 %forward: 22.926070319506334 %backward: 61.51983823068774 [2025-04-07 14:37:17,768] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26190.62 | forward: 145416.16 | backward_microstep: 390221.47 | backward: 390209.85 | backward_inner_microstep: 390191.57 | backward_inner: 390184.59 | backward_allreduce_microstep: 8.73 | backward_allreduce: 3.02 | reduce_tied_grads: 0.35 | comms: 18.23 | reduce_grads: 0.24 | step: 367.95 | _step_clipping: 0.14 | _step_step: 365.90 | _step_zero_grad: 0.59 | _step_check_overflow: 0.69 samples/sec: 16.144 | iteration 18750/ 143000 | elapsed time per iteration (ms): 63428.9 | learning rate: 5.786E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.382129E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 14:48:04,119] [INFO] [logging.py:60:log_dist] [Rank 0] step=18760, skipped=18, lr=[0.0005785626060771712, 0.0005785626060771712], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18760 loss: 2.3806 iter time (s): 64.635 samples/sec: 15.843 %comms: 0.002793429964123693 %optimizer_step 0.05524505823329129 %forward: 22.560035819762643 %backward: 60.410540821117564 [2025-04-07 14:48:04,119] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 37513.83 | forward: 145815.71 | backward_microstep: 390475.53 | backward: 390460.63 | backward_inner_microstep: 390442.17 | backward_inner: 390434.81 | backward_allreduce_microstep: 8.74 | backward_allreduce: 3.03 | reduce_tied_grads: 0.32 | comms: 18.06 | reduce_grads: 0.20 | step: 357.07 | _step_clipping: 0.14 | _step_step: 355.30 | _step_zero_grad: 0.54 | _step_check_overflow: 0.51 samples/sec: 15.843 | iteration 18760/ 143000 | elapsed time per iteration (ms): 64635.1 | learning rate: 5.786E-04 | approx flops per GPU: 68.3TFLOPS | lm_loss: 2.378623E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 14:58:37,590] [INFO] [logging.py:60:log_dist] [Rank 0] step=18770, skipped=18, lr=[0.0005785381326768355, 0.0005785381326768355], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18770 loss: 2.3560 iter time (s): 63.347 samples/sec: 16.165 %comms: 0.0028410800364064726 %optimizer_step 0.05713063959863436 %forward: 22.97004989474906 %backward: 61.5905182318925 [2025-04-07 14:58:37,591] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25278.40 | forward: 145507.37 | backward_microstep: 390164.53 | backward: 390154.75 | backward_inner_microstep: 390137.33 | backward_inner: 390130.65 | backward_allreduce_microstep: 8.37 | backward_allreduce: 2.88 | reduce_tied_grads: 0.32 | comms: 18.00 | reduce_grads: 0.20 | step: 361.90 | _step_clipping: 0.17 | _step_step: 360.05 | _step_zero_grad: 0.53 | _step_check_overflow: 0.54 samples/sec: 16.165 | iteration 18770/ 143000 | elapsed time per iteration (ms): 63347.1 | learning rate: 5.785E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.370060E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 15:09:23,178] [INFO] [logging.py:60:log_dist] [Rank 0] step=18780, skipped=18, lr=[0.0005785136458330006, 0.0005785136458330006], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18780 loss: 2.3760 iter time (s): 64.558 samples/sec: 15.862 %comms: 0.002904534832455418 %optimizer_step 0.0573439924763714 %forward: 22.55919184525359 %backward: 60.47061664404206 [2025-04-07 15:09:23,179] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 36947.89 | forward: 145638.01 | backward_microstep: 390403.59 | backward: 390387.22 | backward_inner_microstep: 390368.72 | backward_inner: 390358.01 | backward_allreduce_microstep: 8.76 | backward_allreduce: 3.03 | reduce_tied_grads: 0.35 | comms: 18.75 | reduce_grads: 0.24 | step: 370.20 | _step_clipping: 0.13 | _step_step: 368.07 | _step_zero_grad: 0.62 | _step_check_overflow: 0.68 samples/sec: 15.861 | iteration 18780/ 143000 | elapsed time per iteration (ms): 64558.9 | learning rate: 5.785E-04 | approx flops per GPU: 68.4TFLOPS | lm_loss: 2.380339E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 15:20:14,442] [INFO] [logging.py:60:log_dist] [Rank 0] step=18790, skipped=18, lr=[0.0005784891455468483, 0.0005784891455468483], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18790 loss: 2.3758 iter time (s): 65.126 samples/sec: 15.723 %comms: 0.0027951695124605533 %optimizer_step 0.055945307541381106 %forward: 22.365902826265774 %backward: 59.92882935814335 [2025-04-07 15:20:14,443] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 42730.73 | forward: 145659.49 | backward_microstep: 390303.93 | backward: 390290.66 | backward_inner_microstep: 390272.58 | backward_inner: 390263.98 | backward_allreduce_microstep: 8.70 | backward_allreduce: 3.09 | reduce_tied_grads: 0.35 | comms: 18.20 | reduce_grads: 0.21 | step: 364.35 | _step_clipping: 0.15 | _step_step: 362.38 | _step_zero_grad: 0.54 | _step_check_overflow: 0.62 samples/sec: 15.723 | iteration 18790/ 143000 | elapsed time per iteration (ms): 65126.4 | learning rate: 5.785E-04 | approx flops per GPU: 67.8TFLOPS | lm_loss: 2.370530E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 15:31:03,451] [INFO] [logging.py:60:log_dist] [Rank 0] step=18800, skipped=18, lr=[0.0005784646318195609, 0.0005784646318195609], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18800 loss: 2.3576 iter time (s): 64.900 samples/sec: 15.778 %comms: 0.002791984325524255 %optimizer_step 0.056993716004860424 %forward: 22.452239563256875 %backward: 60.141812327005226 [2025-04-07 15:31:03,452] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 40331.65 | forward: 145715.62 | backward_microstep: 390335.04 | backward: 390321.92 | backward_inner_microstep: 390303.58 | backward_inner: 390296.75 | backward_allreduce_microstep: 8.87 | backward_allreduce: 2.94 | reduce_tied_grads: 0.34 | comms: 18.12 | reduce_grads: 0.22 | step: 369.89 | _step_clipping: 0.12 | _step_step: 368.08 | _step_zero_grad: 0.57 | _step_check_overflow: 0.47 samples/sec: 15.778 | iteration 18800/ 143000 | elapsed time per iteration (ms): 64900.9 | learning rate: 5.785E-04 | approx flops per GPU: 68.1TFLOPS | lm_loss: 2.359922E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 15:41:40,108] [INFO] [logging.py:60:log_dist] [Rank 0] step=18810, skipped=18, lr=[0.0005784401046523218, 0.0005784401046523218], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18810 loss: 2.3668 iter time (s): 63.665 samples/sec: 16.084 %comms: 0.0028720644100811944 %optimizer_step 0.055767137504002466 %forward: 22.89988295640044 %backward: 61.30899244471537 [2025-04-07 15:41:40,109] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27933.38 | forward: 145792.41 | backward_microstep: 390337.86 | backward: 390324.51 | backward_inner_microstep: 390306.35 | backward_inner: 390299.52 | backward_allreduce_microstep: 8.55 | backward_allreduce: 2.99 | reduce_tied_grads: 0.32 | comms: 18.29 | reduce_grads: 0.20 | step: 355.04 | _step_clipping: 0.14 | _step_step: 353.14 | _step_zero_grad: 0.53 | _step_check_overflow: 0.64 samples/sec: 16.084 | iteration 18810/ 143000 | elapsed time per iteration (ms): 63665.7 | learning rate: 5.784E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.368450E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 15:52:22,811] [INFO] [logging.py:60:log_dist] [Rank 0] step=18820, skipped=18, lr=[0.0005784155640463146, 0.0005784155640463146], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18820 loss: 2.3797 iter time (s): 64.270 samples/sec: 15.933 %comms: 0.0028837775210614697 %optimizer_step 0.056343931270811665 %forward: 22.70127578522305 %backward: 60.801441425646715 [2025-04-07 15:52:22,812] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 33342.14 | forward: 145900.37 | backward_microstep: 390788.89 | backward: 390768.90 | backward_inner_microstep: 390744.64 | backward_inner: 390736.95 | backward_allreduce_microstep: 12.28 | backward_allreduce: 2.97 | reduce_tied_grads: 0.36 | comms: 18.53 | reduce_grads: 0.23 | step: 362.12 | _step_clipping: 0.15 | _step_step: 360.20 | _step_zero_grad: 0.56 | _step_check_overflow: 0.53 samples/sec: 15.933 | iteration 18820/ 143000 | elapsed time per iteration (ms): 64270.3 | learning rate: 5.784E-04 | approx flops per GPU: 68.7TFLOPS | lm_loss: 2.376996E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 16:03:02,557] [INFO] [logging.py:60:log_dist] [Rank 0] step=18830, skipped=18, lr=[0.000578391010002724, 0.000578391010002724], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18830 loss: 2.3651 iter time (s): 63.974 samples/sec: 16.007 %comms: 0.00282421174333174 %optimizer_step 0.05592051801373453 %forward: 22.774631441112856 %backward: 61.02417054258401 [2025-04-07 16:03:02,557] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31009.07 | forward: 145698.32 | backward_microstep: 390409.73 | backward: 390395.73 | backward_inner_microstep: 390377.73 | backward_inner: 390370.61 | backward_allreduce_microstep: 8.54 | backward_allreduce: 2.92 | reduce_tied_grads: 0.34 | comms: 18.07 | reduce_grads: 0.22 | step: 357.75 | _step_clipping: 0.13 | _step_step: 355.99 | _step_zero_grad: 0.55 | _step_check_overflow: 0.47 samples/sec: 16.006 | iteration 18830/ 143000 | elapsed time per iteration (ms): 63974.6 | learning rate: 5.784E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.373317E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 16:13:39,450] [INFO] [logging.py:60:log_dist] [Rank 0] step=18840, skipped=18, lr=[0.0005783664425227347, 0.0005783664425227347], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18840 loss: 2.3818 iter time (s): 63.689 samples/sec: 16.078 %comms: 0.002859657112512573 %optimizer_step 0.056041754976923146 %forward: 22.85792769191716 %backward: 61.30373695660237 [2025-04-07 16:13:39,451] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28231.15 | forward: 145579.26 | backward_microstep: 390453.24 | backward: 390435.76 | backward_inner_microstep: 390418.02 | backward_inner: 390410.88 | backward_allreduce_microstep: 8.28 | backward_allreduce: 2.88 | reduce_tied_grads: 0.31 | comms: 18.21 | reduce_grads: 0.20 | step: 356.92 | _step_clipping: 0.14 | _step_step: 355.16 | _step_zero_grad: 0.47 | _step_check_overflow: 0.54 samples/sec: 16.078 | iteration 18840/ 143000 | elapsed time per iteration (ms): 63689.3 | learning rate: 5.784E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.371360E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 16:24:15,898] [INFO] [logging.py:60:log_dist] [Rank 0] step=18850, skipped=18, lr=[0.0005783418616075327, 0.0005783418616075327], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18850 loss: 2.3670 iter time (s): 63.644 samples/sec: 16.089 %comms: 0.0028775772545954524 %optimizer_step 0.0554279481458893 %forward: 22.86727843875436 %backward: 61.33200730397773 [2025-04-07 16:24:15,899] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27919.96 | forward: 145537.07 | backward_microstep: 390355.58 | backward: 390342.93 | backward_inner_microstep: 390322.60 | backward_inner: 390315.82 | backward_allreduce_microstep: 9.68 | backward_allreduce: 4.51 | reduce_tied_grads: 0.31 | comms: 18.31 | reduce_grads: 0.20 | step: 352.77 | _step_clipping: 0.12 | _step_step: 350.93 | _step_zero_grad: 0.52 | _step_check_overflow: 0.57 samples/sec: 16.089 | iteration 18850/ 143000 | elapsed time per iteration (ms): 63644.8 | learning rate: 5.783E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.379556E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 16:34:47,878] [INFO] [logging.py:60:log_dist] [Rank 0] step=18860, skipped=18, lr=[0.0005783172672583044, 0.0005783172672583044], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18860 loss: 2.3711 iter time (s): 63.197 samples/sec: 16.203 %comms: 0.0028813615038230716 %optimizer_step 0.05563061888671161 %forward: 23.012875420242487 %backward: 61.755110835056506 [2025-04-07 16:34:47,879] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23593.20 | forward: 145435.40 | backward_microstep: 390287.87 | backward: 390276.28 | backward_inner_microstep: 390259.20 | backward_inner: 390252.65 | backward_allreduce_microstep: 8.03 | backward_allreduce: 2.73 | reduce_tied_grads: 0.30 | comms: 18.21 | reduce_grads: 0.19 | step: 351.57 | _step_clipping: 0.11 | _step_step: 349.81 | _step_zero_grad: 0.49 | _step_check_overflow: 0.56 samples/sec: 16.203 | iteration 18860/ 143000 | elapsed time per iteration (ms): 63198.0 | learning rate: 5.783E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.378150E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 16:45:29,704] [INFO] [logging.py:60:log_dist] [Rank 0] step=18870, skipped=18, lr=[0.0005782926594762367, 0.0005782926594762367], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18870 loss: 2.3610 iter time (s): 64.182 samples/sec: 15.955 %comms: 0.0028376414803218354 %optimizer_step 0.05848898075209645 %forward: 22.727128047166335 %backward: 60.84458324908321 [2025-04-07 16:45:29,705] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32713.22 | forward: 145867.30 | backward_microstep: 390526.26 | backward: 390512.84 | backward_inner_microstep: 390495.06 | backward_inner: 390488.21 | backward_allreduce_microstep: 8.41 | backward_allreduce: 2.89 | reduce_tied_grads: 0.34 | comms: 18.21 | reduce_grads: 0.22 | step: 375.39 | _step_clipping: 0.14 | _step_step: 373.35 | _step_zero_grad: 0.58 | _step_check_overflow: 0.67 samples/sec: 15.954 | iteration 18870/ 143000 | elapsed time per iteration (ms): 64182.6 | learning rate: 5.783E-04 | approx flops per GPU: 68.8TFLOPS | lm_loss: 2.364459E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 16:56:05,291] [INFO] [logging.py:60:log_dist] [Rank 0] step=18880, skipped=18, lr=[0.0005782680382625174, 0.0005782680382625174], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18880 loss: 2.3567 iter time (s): 63.558 samples/sec: 16.111 %comms: 0.0028381551384056604 %optimizer_step 0.05518328643644583 %forward: 22.928697846593753 %backward: 61.42616135084742 [2025-04-07 16:56:05,291] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26721.11 | forward: 145730.24 | backward_microstep: 390426.30 | backward: 390412.47 | backward_inner_microstep: 390394.29 | backward_inner: 390386.93 | backward_allreduce_microstep: 8.64 | backward_allreduce: 2.95 | reduce_tied_grads: 0.32 | comms: 18.04 | reduce_grads: 0.19 | step: 350.73 | _step_clipping: 0.14 | _step_step: 349.01 | _step_zero_grad: 0.49 | _step_check_overflow: 0.52 samples/sec: 16.111 | iteration 18880/ 143000 | elapsed time per iteration (ms): 63558.6 | learning rate: 5.783E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.366327E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 17:06:42,698] [INFO] [logging.py:60:log_dist] [Rank 0] step=18890, skipped=18, lr=[0.0005782434036183348, 0.0005782434036183348], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18890 loss: 2.3526 iter time (s): 63.740 samples/sec: 16.065 %comms: 0.002946528219703917 %optimizer_step 0.05713737675789628 %forward: 22.860801716566982 %backward: 61.261837554808864 [2025-04-07 17:06:42,699] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28448.89 | forward: 145714.86 | backward_microstep: 390497.78 | backward: 390483.25 | backward_inner_microstep: 390463.17 | backward_inner: 390456.22 | backward_allreduce_microstep: 8.73 | backward_allreduce: 3.09 | reduce_tied_grads: 0.31 | comms: 18.78 | reduce_grads: 0.23 | step: 364.19 | _step_clipping: 0.12 | _step_step: 362.01 | _step_zero_grad: 0.66 | _step_check_overflow: 0.70 samples/sec: 16.065 | iteration 18890/ 143000 | elapsed time per iteration (ms): 63740.7 | learning rate: 5.782E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.374306E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 17:17:22,899] [INFO] [logging.py:60:log_dist] [Rank 0] step=18900, skipped=18, lr=[0.0005782187555448779, 0.0005782187555448779], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18900 loss: 2.3687 iter time (s): 64.019 samples/sec: 15.995 %comms: 0.0028262627632070275 %optimizer_step 0.05618177626209735 %forward: 23.338498171808155 %backward: 61.00461825971445 [2025-04-07 17:17:22,899] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27448.63 | forward: 149411.84 | backward_microstep: 390565.77 | backward: 390548.37 | backward_inner_microstep: 390529.77 | backward_inner: 390522.59 | backward_allreduce_microstep: 8.57 | backward_allreduce: 2.95 | reduce_tied_grads: 0.35 | comms: 18.09 | reduce_grads: 0.22 | step: 359.67 | _step_clipping: 0.11 | _step_step: 357.77 | _step_zero_grad: 0.57 | _step_check_overflow: 0.59 samples/sec: 15.995 | iteration 18900/ 143000 | elapsed time per iteration (ms): 64020.1 | learning rate: 5.782E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.369148E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 17:28:01,389] [INFO] [logging.py:60:log_dist] [Rank 0] step=18910, skipped=18, lr=[0.0005781940940433361, 0.0005781940940433361], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18910 loss: 2.3618 iter time (s): 63.848 samples/sec: 16.038 %comms: 0.002848920796976075 %optimizer_step 0.05492815189151289 %forward: 22.83619653103456 %backward: 61.14647530310259 [2025-04-07 17:28:01,389] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29516.65 | forward: 145805.49 | backward_microstep: 390424.34 | backward: 390410.54 | backward_inner_microstep: 390390.84 | backward_inner: 390384.05 | backward_allreduce_microstep: 10.36 | backward_allreduce: 4.75 | reduce_tied_grads: 0.38 | comms: 18.19 | reduce_grads: 0.21 | step: 350.71 | _step_clipping: 0.13 | _step_step: 348.94 | _step_zero_grad: 0.52 | _step_check_overflow: 0.51 samples/sec: 16.038 | iteration 18910/ 143000 | elapsed time per iteration (ms): 63849.0 | learning rate: 5.782E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.365503E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 17:38:41,872] [INFO] [logging.py:60:log_dist] [Rank 0] step=18920, skipped=18, lr=[0.0005781694191149, 0.0005781694191149], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18920 loss: 2.3777 iter time (s): 64.048 samples/sec: 15.988 %comms: 0.002824417051836085 %optimizer_step 0.05633247955326178 %forward: 22.804337059933257 %backward: 60.94692710298375 [2025-04-07 17:38:41,873] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31284.80 | forward: 146056.77 | backward_microstep: 390366.81 | backward: 390351.69 | backward_inner_microstep: 390333.39 | backward_inner: 390326.44 | backward_allreduce_microstep: 8.76 | backward_allreduce: 3.16 | reduce_tied_grads: 0.32 | comms: 18.09 | reduce_grads: 0.22 | step: 360.80 | _step_clipping: 0.13 | _step_step: 358.91 | _step_zero_grad: 0.54 | _step_check_overflow: 0.60 samples/sec: 15.988 | iteration 18920/ 143000 | elapsed time per iteration (ms): 64048.4 | learning rate: 5.782E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.369529E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 17:49:18,458] [INFO] [logging.py:60:log_dist] [Rank 0] step=18930, skipped=18, lr=[0.0005781447307607604, 0.0005781447307607604], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18930 loss: 2.3741 iter time (s): 63.658 samples/sec: 16.086 %comms: 0.002849988489035256 %optimizer_step 0.055110563927969515 %forward: 22.900902523096324 %backward: 61.32298254417837 [2025-04-07 17:49:18,459] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27633.23 | forward: 145782.61 | backward_microstep: 390383.94 | backward: 390369.95 | backward_inner_microstep: 390352.01 | backward_inner: 390345.06 | backward_allreduce_microstep: 8.44 | backward_allreduce: 2.89 | reduce_tied_grads: 0.33 | comms: 18.14 | reduce_grads: 0.20 | step: 350.82 | _step_clipping: 0.13 | _step_step: 348.89 | _step_zero_grad: 0.52 | _step_check_overflow: 0.69 samples/sec: 16.086 | iteration 18930/ 143000 | elapsed time per iteration (ms): 63658.6 | learning rate: 5.781E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.367783E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 18:00:04,064] [INFO] [logging.py:60:log_dist] [Rank 0] step=18940, skipped=18, lr=[0.0005781200289821088, 0.0005781200289821088], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18940 loss: 2.3476 iter time (s): 64.560 samples/sec: 15.861 %comms: 0.0028398642903605235 %optimizer_step 0.055626057365113904 %forward: 22.591768252278356 %backward: 60.492617493606424 [2025-04-07 18:00:04,065] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 36391.86 | forward: 145852.35 | backward_microstep: 390557.30 | backward: 390540.05 | backward_inner_microstep: 390519.13 | backward_inner: 390511.92 | backward_allreduce_microstep: 10.87 | backward_allreduce: 5.24 | reduce_tied_grads: 0.36 | comms: 18.33 | reduce_grads: 0.22 | step: 359.12 | _step_clipping: 0.14 | _step_step: 357.16 | _step_zero_grad: 0.54 | _step_check_overflow: 0.62 samples/sec: 15.861 | iteration 18940/ 143000 | elapsed time per iteration (ms): 64560.6 | learning rate: 5.781E-04 | approx flops per GPU: 68.4TFLOPS | lm_loss: 2.360704E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 18:10:41,915] [INFO] [logging.py:60:log_dist] [Rank 0] step=18950, skipped=18, lr=[0.0005780953137801375, 0.0005780953137801375], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18950 loss: 2.3624 iter time (s): 63.784 samples/sec: 16.054 %comms: 0.00286871184681133 %optimizer_step 0.057347137295993916 %forward: 22.85096921521276 %backward: 61.21316258950704 [2025-04-07 18:10:41,915] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28830.58 | forward: 145753.57 | backward_microstep: 390460.84 | backward: 390444.58 | backward_inner_microstep: 390426.21 | backward_inner: 390419.21 | backward_allreduce_microstep: 8.68 | backward_allreduce: 2.99 | reduce_tied_grads: 0.34 | comms: 18.30 | reduce_grads: 0.22 | step: 365.79 | _step_clipping: 0.14 | _step_step: 363.91 | _step_zero_grad: 0.54 | _step_check_overflow: 0.50 samples/sec: 16.054 | iteration 18950/ 143000 | elapsed time per iteration (ms): 63785.0 | learning rate: 5.781E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.376411E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 18:21:15,212] [INFO] [logging.py:60:log_dist] [Rank 0] step=18960, skipped=18, lr=[0.0005780705851560394, 0.0005780705851560394], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18960 loss: 2.3550 iter time (s): 63.329 samples/sec: 16.169 %comms: 0.0028756318369704123 %optimizer_step 0.059301873594256246 %forward: 22.98339358285191 %backward: 61.61802282027682 [2025-04-07 18:21:15,212] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24686.51 | forward: 145551.83 | backward_microstep: 390232.54 | backward: 390221.58 | backward_inner_microstep: 390204.04 | backward_inner: 390197.37 | backward_allreduce_microstep: 8.46 | backward_allreduce: 2.93 | reduce_tied_grads: 0.33 | comms: 18.21 | reduce_grads: 0.22 | step: 375.55 | _step_clipping: 0.13 | _step_step: 373.77 | _step_zero_grad: 0.52 | _step_check_overflow: 0.53 samples/sec: 16.169 | iteration 18960/ 143000 | elapsed time per iteration (ms): 63329.7 | learning rate: 5.781E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.365981E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 18:31:49,459] [INFO] [logging.py:60:log_dist] [Rank 0] step=18970, skipped=18, lr=[0.0005780458431110078, 0.0005780458431110078], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18970 loss: 2.3578 iter time (s): 63.424 samples/sec: 16.145 %comms: 0.0028635789610862526 %optimizer_step 0.05565380766115811 %forward: 22.98291919037348 %backward: 61.57566481918365 [2025-04-07 18:31:49,460] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25080.52 | forward: 145767.26 | backward_microstep: 390552.41 | backward: 390538.55 | backward_inner_microstep: 390521.34 | backward_inner: 390514.49 | backward_allreduce_microstep: 8.12 | backward_allreduce: 2.81 | reduce_tied_grads: 0.33 | comms: 18.16 | reduce_grads: 0.20 | step: 352.98 | _step_clipping: 0.12 | _step_step: 351.09 | _step_zero_grad: 0.53 | _step_check_overflow: 0.62 samples/sec: 16.145 | iteration 18970/ 143000 | elapsed time per iteration (ms): 63424.7 | learning rate: 5.780E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.370662E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 18:42:26,990] [INFO] [logging.py:60:log_dist] [Rank 0] step=18980, skipped=18, lr=[0.0005780210876462372, 0.0005780210876462372], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18980 loss: 2.3688 iter time (s): 63.753 samples/sec: 16.062 %comms: 0.002841724597800092 %optimizer_step 0.0564288411642567 %forward: 22.82774452595414 %backward: 61.22635781910037 [2025-04-07 18:42:26,991] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28830.32 | forward: 145532.64 | backward_microstep: 390344.47 | backward: 390333.51 | backward_inner_microstep: 390315.18 | backward_inner: 390308.45 | backward_allreduce_microstep: 8.84 | backward_allreduce: 3.06 | reduce_tied_grads: 0.39 | comms: 18.12 | reduce_grads: 0.22 | step: 359.75 | _step_clipping: 0.14 | _step_step: 357.88 | _step_zero_grad: 0.54 | _step_check_overflow: 0.58 samples/sec: 16.062 | iteration 18980/ 143000 | elapsed time per iteration (ms): 63753.1 | learning rate: 5.780E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.365500E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 18:52:58,343] [INFO] [logging.py:60:log_dist] [Rank 0] step=18990, skipped=18, lr=[0.0005779963187629221, 0.0005779963187629221], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 18990 loss: 2.3754 iter time (s): 63.135 samples/sec: 16.219 %comms: 0.0029000468438662367 %optimizer_step 0.05832785536846804 %forward: 23.055959819748452 %backward: 61.86093812290042 [2025-04-07 18:52:58,344] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22331.22 | forward: 145563.08 | backward_microstep: 390570.95 | backward: 390557.09 | backward_inner_microstep: 390539.13 | backward_inner: 390532.08 | backward_allreduce_microstep: 8.55 | backward_allreduce: 2.97 | reduce_tied_grads: 0.31 | comms: 18.31 | reduce_grads: 0.21 | step: 368.25 | _step_clipping: 0.12 | _step_step: 366.18 | _step_zero_grad: 0.59 | _step_check_overflow: 0.71 samples/sec: 16.219 | iteration 18990/ 143000 | elapsed time per iteration (ms): 63135.3 | learning rate: 5.780E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.363962E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 19:03:42,955] [INFO] [stage1.py:697:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 524288.0, reducing to 524288.0 [2025-04-07 19:03:42,956] [INFO] [logging.py:60:log_dist] [Rank 0] step=19000, skipped=19, lr=[0.000577974015296071, 0.000577974015296071], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19000 loss: 2.3710 iter time (s): 64.461 samples/sec: 15.886 %comms: 0.0025469417290845606 %optimizer_step 0.052836400670899245 %forward: 22.598528733539432 %backward: 60.56251975987158 [2025-04-07 19:03:42,956] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 35684.84 | forward: 145671.49 | backward_microstep: 390405.33 | backward: 390389.70 | backward_inner_microstep: 390370.83 | backward_inner: 390362.08 | backward_allreduce_microstep: 9.12 | backward_allreduce: 3.11 | reduce_tied_grads: 0.39 | comms: 16.42 | reduce_grads: 0.23 | step: 340.59 | _step_clipping: 0.16 | _step_step: 338.78 | _step_zero_grad: 0.56 | _step_check_overflow: 0.45 samples/sec: 15.886 | iteration 19000/ 143000 | elapsed time per iteration (ms): 64461.3 | learning rate: 5.780E-04 | approx flops per GPU: 68.5TFLOPS | lm_loss: 2.374156E+00 | loss scale: 524288.0 | number of skipped iterations: 1 | number of nan iterations: 0 | time (ms) [2025-04-07 19:03:45,940] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: ../checkpoints/mydata_left-pythia160m/global_step19000/mp_rank_00_model_states.pt [2025-04-07 19:04:00,717] [INFO] [engine.py:1805:_copy_recovery_script] creating recovery script ../checkpoints/mydata_left-pythia160m/zero_to_fp32.py [2025-04-07 19:04:00,723] [INFO] [engine.py:1818:_save_zero_checkpoint] zero checkpoint saved ../checkpoints/mydata_left-pythia160m/global_step19000/zero_pp_rank_0_mp_rank_00_optim_states.pt [2025-04-07 19:05:03,206] [INFO] [stage1.py:697:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 524288.0, reducing to 262144.0 [2025-04-07 19:14:32,686] [INFO] [logging.py:60:log_dist] [Rank 0] step=19010, skipped=20, lr=[0.0005779517009620394, 0.0005779517009620394], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19010 loss: 2.3693 iter time (s): 63.195 samples/sec: 16.204 %comms: 0.002573091291127032 %optimizer_step 0.052421877640453744 %forward: 23.031635415324388 %backward: 61.758235677397145 [2025-04-07 19:14:32,687] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23311.33 | forward: 145548.18 | backward_microstep: 390291.11 | backward: 390280.53 | backward_inner_microstep: 390263.20 | backward_inner: 390254.48 | backward_allreduce_microstep: 8.23 | backward_allreduce: 2.83 | reduce_tied_grads: 0.31 | comms: 16.26 | reduce_grads: 0.21 | step: 331.28 | _step_clipping: 0.14 | _step_step: 329.36 | _step_zero_grad: 0.54 | _step_check_overflow: 0.63 samples/sec: 15.760 | iteration 19010/ 143000 | elapsed time per iteration (ms): 64973.1 | learning rate: 5.780E-04 | approx flops per GPU: 68.0TFLOPS | lm_loss: 2.376742E+00 | loss scale: 262144.0 | number of skipped iterations: 1 | number of nan iterations: 0 | time (ms) [2025-04-07 19:25:10,802] [INFO] [logging.py:60:log_dist] [Rank 0] step=19020, skipped=20, lr=[0.000577926894513162, 0.000577926894513162], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19020 loss: 2.3543 iter time (s): 63.811 samples/sec: 16.047 %comms: 0.0029066027484721704 %optimizer_step 0.05638180881426968 %forward: 22.840249692708014 %backward: 61.19709321760881 [2025-04-07 19:25:10,802] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29049.91 | forward: 145745.71 | backward_microstep: 390517.97 | backward: 390504.22 | backward_inner_microstep: 390486.08 | backward_inner: 390479.19 | backward_allreduce_microstep: 8.65 | backward_allreduce: 3.10 | reduce_tied_grads: 0.34 | comms: 18.55 | reduce_grads: 0.22 | step: 359.78 | _step_clipping: 0.14 | _step_step: 357.95 | _step_zero_grad: 0.55 | _step_check_overflow: 0.49 samples/sec: 16.047 | iteration 19020/ 143000 | elapsed time per iteration (ms): 63811.5 | learning rate: 5.779E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.369369E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 19:35:45,264] [INFO] [logging.py:60:log_dist] [Rank 0] step=19030, skipped=20, lr=[0.0005779020746502863, 0.0005779020746502863], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19030 loss: 2.3636 iter time (s): 63.446 samples/sec: 16.140 %comms: 0.0028711759115988217 %optimizer_step 0.05689422678881343 %forward: 22.954550418307466 %backward: 61.57758957858406 [2025-04-07 19:35:45,265] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25316.66 | forward: 145636.71 | backward_microstep: 390702.07 | backward: 390683.22 | backward_inner_microstep: 390663.06 | backward_inner: 390655.77 | backward_allreduce_microstep: 8.55 | backward_allreduce: 2.99 | reduce_tied_grads: 0.33 | comms: 18.22 | reduce_grads: 0.21 | step: 360.97 | _step_clipping: 0.13 | _step_step: 358.88 | _step_zero_grad: 0.61 | _step_check_overflow: 0.69 samples/sec: 16.140 | iteration 19030/ 143000 | elapsed time per iteration (ms): 63446.3 | learning rate: 5.779E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.367787E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 19:46:19,531] [INFO] [logging.py:60:log_dist] [Rank 0] step=19040, skipped=20, lr=[0.0005778772413746105, 0.0005778772413746105], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19040 loss: 2.3573 iter time (s): 63.426 samples/sec: 16.145 %comms: 0.0028479703453708235 %optimizer_step 0.05619042784935128 %forward: 22.9524072718754 %backward: 61.57063871492659 [2025-04-07 19:46:19,532] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25434.09 | forward: 145578.01 | backward_microstep: 390532.61 | backward: 390518.12 | backward_inner_microstep: 390500.03 | backward_inner: 390492.94 | backward_allreduce_microstep: 8.46 | backward_allreduce: 2.93 | reduce_tied_grads: 0.31 | comms: 18.06 | reduce_grads: 0.21 | step: 356.39 | _step_clipping: 0.12 | _step_step: 354.64 | _step_zero_grad: 0.51 | _step_check_overflow: 0.53 samples/sec: 16.145 | iteration 19040/ 143000 | elapsed time per iteration (ms): 63426.6 | learning rate: 5.779E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.365055E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 19:56:53,370] [INFO] [logging.py:60:log_dist] [Rank 0] step=19050, skipped=20, lr=[0.0005778523946873328, 0.0005778523946873328], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19050 loss: 2.3831 iter time (s): 63.383 samples/sec: 16.156 %comms: 0.0029075180819292798 %optimizer_step 0.056443037349868344 %forward: 22.96107386836166 %backward: 61.58977523901209 [2025-04-07 19:56:53,371] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25185.78 | forward: 145534.81 | backward_microstep: 390389.70 | backward: 390376.18 | backward_inner_microstep: 390359.15 | backward_inner: 390352.38 | backward_allreduce_microstep: 7.95 | backward_allreduce: 2.75 | reduce_tied_grads: 0.34 | comms: 18.43 | reduce_grads: 0.23 | step: 357.75 | _step_clipping: 0.12 | _step_step: 355.81 | _step_zero_grad: 0.56 | _step_check_overflow: 0.62 samples/sec: 16.156 | iteration 19050/ 143000 | elapsed time per iteration (ms): 63383.9 | learning rate: 5.779E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.368681E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 20:07:35,456] [INFO] [logging.py:60:log_dist] [Rank 0] step=19060, skipped=20, lr=[0.0005778275345896529, 0.0005778275345896529], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19060 loss: 2.3611 iter time (s): 64.208 samples/sec: 15.948 %comms: 0.0029208263335847215 %optimizer_step 0.05795085055868291 %forward: 22.675877423072237 %backward: 60.80762197511223 [2025-04-07 20:07:35,456] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 33284.09 | forward: 145596.99 | backward_microstep: 390447.76 | backward: 390432.82 | backward_inner_microstep: 390414.62 | backward_inner: 390407.67 | backward_allreduce_microstep: 8.67 | backward_allreduce: 2.98 | reduce_tied_grads: 0.38 | comms: 18.75 | reduce_grads: 0.42 | step: 372.09 | _step_clipping: 0.16 | _step_step: 370.08 | _step_zero_grad: 0.59 | _step_check_overflow: 0.58 samples/sec: 15.948 | iteration 19060/ 143000 | elapsed time per iteration (ms): 64208.5 | learning rate: 5.778E-04 | approx flops per GPU: 68.8TFLOPS | lm_loss: 2.369872E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 20:18:13,015] [INFO] [logging.py:60:log_dist] [Rank 0] step=19070, skipped=20, lr=[0.0005778026610827701, 0.0005778026610827701], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19070 loss: 2.3732 iter time (s): 63.755 samples/sec: 16.061 %comms: 0.0028508364066405483 %optimizer_step 0.057155803356415874 %forward: 22.865223635683847 %backward: 61.22410991726942 [2025-04-07 20:18:13,016] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28694.34 | forward: 145777.99 | backward_microstep: 390355.34 | backward: 390336.34 | backward_inner_microstep: 390314.76 | backward_inner: 390307.94 | backward_allreduce_microstep: 10.31 | backward_allreduce: 2.95 | reduce_tied_grads: 0.30 | comms: 18.18 | reduce_grads: 0.24 | step: 364.40 | _step_clipping: 0.13 | _step_step: 362.45 | _step_zero_grad: 0.58 | _step_check_overflow: 0.58 samples/sec: 16.061 | iteration 19070/ 143000 | elapsed time per iteration (ms): 63755.9 | learning rate: 5.778E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.365889E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 20:28:40,407] [INFO] [logging.py:60:log_dist] [Rank 0] step=19080, skipped=20, lr=[0.0005777777741678852, 0.0005777777741678852], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19080 loss: 2.3771 iter time (s): 62.739 samples/sec: 16.322 %comms: 0.002872069760015803 %optimizer_step 0.055682135141433435 %forward: 23.185897144488617 %backward: 62.20669035078217 [2025-04-07 20:28:40,407] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18955.05 | forward: 145465.05 | backward_microstep: 390288.06 | backward: 390276.01 | backward_inner_microstep: 390259.17 | backward_inner: 390252.64 | backward_allreduce_microstep: 7.97 | backward_allreduce: 2.73 | reduce_tied_grads: 0.32 | comms: 18.02 | reduce_grads: 0.20 | step: 349.34 | _step_clipping: 0.14 | _step_step: 347.48 | _step_zero_grad: 0.50 | _step_check_overflow: 0.64 samples/sec: 16.322 | iteration 19080/ 143000 | elapsed time per iteration (ms): 62739.2 | learning rate: 5.778E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.379834E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 20:39:17,709] [INFO] [logging.py:60:log_dist] [Rank 0] step=19090, skipped=20, lr=[0.0005777528738461993, 0.0005777528738461993], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19090 loss: 2.3564 iter time (s): 63.730 samples/sec: 16.068 %comms: 0.002866989076033502 %optimizer_step 0.05657674753887609 %forward: 22.853555255978215 %backward: 61.26636699847591 [2025-04-07 20:39:17,710] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28475.43 | forward: 145644.80 | backward_microstep: 390463.14 | backward: 390448.13 | backward_inner_microstep: 390430.05 | backward_inner: 390423.02 | backward_allreduce_microstep: 8.41 | backward_allreduce: 2.91 | reduce_tied_grads: 0.33 | comms: 18.27 | reduce_grads: 0.23 | step: 360.56 | _step_clipping: 0.13 | _step_step: 358.60 | _step_zero_grad: 0.54 | _step_check_overflow: 0.67 samples/sec: 16.068 | iteration 19090/ 143000 | elapsed time per iteration (ms): 63730.2 | learning rate: 5.778E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.362181E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 20:50:00,983] [INFO] [logging.py:60:log_dist] [Rank 0] step=19100, skipped=20, lr=[0.0005777279601189144, 0.0005777279601189144], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19100 loss: 2.3765 iter time (s): 64.327 samples/sec: 15.919 %comms: 0.002875031628081357 %optimizer_step 0.05613902321625103 %forward: 22.70007837398485 %backward: 60.72696618347363 [2025-04-07 20:50:00,983] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 33838.75 | forward: 146022.11 | backward_microstep: 390654.13 | backward: 390636.52 | backward_inner_microstep: 390615.88 | backward_inner: 390608.56 | backward_allreduce_microstep: 8.93 | backward_allreduce: 3.11 | reduce_tied_grads: 0.36 | comms: 18.49 | reduce_grads: 0.24 | step: 361.12 | _step_clipping: 0.14 | _step_step: 359.04 | _step_zero_grad: 0.58 | _step_check_overflow: 0.71 samples/sec: 15.919 | iteration 19100/ 143000 | elapsed time per iteration (ms): 64327.4 | learning rate: 5.777E-04 | approx flops per GPU: 68.7TFLOPS | lm_loss: 2.370444E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 21:00:42,879] [INFO] [logging.py:60:log_dist] [Rank 0] step=19110, skipped=20, lr=[0.0005777030329872327, 0.0005777030329872327], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19110 loss: 2.3820 iter time (s): 64.189 samples/sec: 15.953 %comms: 0.002845022029231106 %optimizer_step 0.05749774069496816 %forward: 22.708140143556566 %backward: 60.80820774874726 [2025-04-07 21:00:42,879] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 33106.54 | forward: 145761.26 | backward_microstep: 390334.38 | backward: 390321.75 | backward_inner_microstep: 390304.22 | backward_inner: 390297.41 | backward_allreduce_microstep: 8.33 | backward_allreduce: 2.82 | reduce_tied_grads: 0.33 | comms: 18.26 | reduce_grads: 0.21 | step: 369.07 | _step_clipping: 0.13 | _step_step: 367.30 | _step_zero_grad: 0.53 | _step_check_overflow: 0.50 samples/sec: 15.953 | iteration 19110/ 143000 | elapsed time per iteration (ms): 64189.6 | learning rate: 5.777E-04 | approx flops per GPU: 68.8TFLOPS | lm_loss: 2.373010E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 21:11:21,890] [INFO] [logging.py:60:log_dist] [Rank 0] step=19120, skipped=20, lr=[0.0005776780924523573, 0.0005776780924523573], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19120 loss: 2.3678 iter time (s): 63.900 samples/sec: 16.025 %comms: 0.0028777918684859995 %optimizer_step 0.05734614995495412 %forward: 22.83144671876352 %backward: 61.142899082685354 [2025-04-07 21:11:21,890] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29648.98 | forward: 145894.02 | backward_microstep: 390723.08 | backward: 390705.99 | backward_inner_microstep: 390685.90 | backward_inner: 390678.59 | backward_allreduce_microstep: 8.71 | backward_allreduce: 2.95 | reduce_tied_grads: 0.39 | comms: 18.39 | reduce_grads: 0.20 | step: 366.44 | _step_clipping: 0.12 | _step_step: 364.57 | _step_zero_grad: 0.53 | _step_check_overflow: 0.58 samples/sec: 16.025 | iteration 19120/ 143000 | elapsed time per iteration (ms): 63901.1 | learning rate: 5.777E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.364560E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 21:21:58,534] [INFO] [logging.py:60:log_dist] [Rank 0] step=19130, skipped=20, lr=[0.000577653138515492, 0.000577653138515492], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19130 loss: 2.3623 iter time (s): 63.664 samples/sec: 16.085 %comms: 0.0028805146027484994 %optimizer_step 0.056063883725325135 %forward: 22.867125776735946 %backward: 61.33505677513868 [2025-04-07 21:21:58,535] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27869.25 | forward: 145580.74 | backward_microstep: 390497.15 | backward: 390482.09 | backward_inner_microstep: 390464.71 | backward_inner: 390457.56 | backward_allreduce_microstep: 8.19 | backward_allreduce: 2.80 | reduce_tied_grads: 0.32 | comms: 18.34 | reduce_grads: 0.23 | step: 356.92 | _step_clipping: 0.13 | _step_step: 355.15 | _step_zero_grad: 0.50 | _step_check_overflow: 0.51 samples/sec: 16.084 | iteration 19130/ 143000 | elapsed time per iteration (ms): 63664.4 | learning rate: 5.777E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.368852E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 21:32:38,116] [INFO] [logging.py:60:log_dist] [Rank 0] step=19140, skipped=20, lr=[0.0005776281711778413, 0.0005776281711778413], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19140 loss: 2.3722 iter time (s): 63.958 samples/sec: 16.011 %comms: 0.002853597854123043 %optimizer_step 0.05591404125561866 %forward: 22.794700287776713 %backward: 61.06817998951376 [2025-04-07 21:32:38,117] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30486.57 | forward: 145789.55 | backward_microstep: 390594.89 | backward: 390577.75 | backward_inner_microstep: 390560.21 | backward_inner: 390551.49 | backward_allreduce_microstep: 8.08 | backward_allreduce: 2.77 | reduce_tied_grads: 0.32 | comms: 18.25 | reduce_grads: 0.21 | step: 357.61 | _step_clipping: 0.14 | _step_step: 355.83 | _step_zero_grad: 0.49 | _step_check_overflow: 0.55 samples/sec: 16.010 | iteration 19140/ 143000 | elapsed time per iteration (ms): 63958.2 | learning rate: 5.776E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.367306E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 21:43:28,103] [INFO] [logging.py:60:log_dist] [Rank 0] step=19150, skipped=20, lr=[0.0005776031904406101, 0.0005776031904406101], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19150 loss: 2.3733 iter time (s): 64.998 samples/sec: 15.754 %comms: 0.0028721828563261194 %optimizer_step 0.05649171605876709 %forward: 22.47487490443647 %backward: 60.08998642571726 [2025-04-07 21:43:28,104] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 40583.25 | forward: 146082.48 | backward_microstep: 390589.92 | backward: 390573.68 | backward_inner_microstep: 390554.23 | backward_inner: 390546.93 | backward_allreduce_microstep: 8.22 | backward_allreduce: 2.84 | reduce_tied_grads: 0.30 | comms: 18.67 | reduce_grads: 0.20 | step: 367.19 | _step_clipping: 0.11 | _step_step: 365.17 | _step_zero_grad: 0.58 | _step_check_overflow: 0.69 samples/sec: 15.754 | iteration 19150/ 143000 | elapsed time per iteration (ms): 64998.7 | learning rate: 5.776E-04 | approx flops per GPU: 68.0TFLOPS | lm_loss: 2.376877E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 21:54:12,172] [INFO] [logging.py:60:log_dist] [Rank 0] step=19160, skipped=20, lr=[0.0005775781963050041, 0.0005775781963050041], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19160 loss: 2.3726 iter time (s): 64.406 samples/sec: 15.899 %comms: 0.002806144893088432 %optimizer_step 0.05583600883173849 %forward: 22.604009923341255 %backward: 60.580597860221175 [2025-04-07 21:54:12,172] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 35645.01 | forward: 145583.90 | backward_microstep: 390187.35 | backward: 390176.77 | backward_inner_microstep: 390157.76 | backward_inner: 390151.08 | backward_allreduce_microstep: 8.31 | backward_allreduce: 2.88 | reduce_tied_grads: 0.33 | comms: 18.07 | reduce_grads: 0.22 | step: 359.62 | _step_clipping: 0.14 | _step_step: 357.76 | _step_zero_grad: 0.54 | _step_check_overflow: 0.58 samples/sec: 15.899 | iteration 19160/ 143000 | elapsed time per iteration (ms): 64406.8 | learning rate: 5.776E-04 | approx flops per GPU: 68.6TFLOPS | lm_loss: 2.370321E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 22:04:46,956] [INFO] [logging.py:60:log_dist] [Rank 0] step=19170, skipped=20, lr=[0.0005775531887722296, 0.0005775531887722296], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19170 loss: 2.3669 iter time (s): 63.478 samples/sec: 16.132 %comms: 0.002868254609705653 %optimizer_step 0.056454954885593725 %forward: 22.92462634066222 %backward: 61.49633640681191 [2025-04-07 22:04:46,957] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26203.29 | forward: 145520.68 | backward_microstep: 390379.88 | backward: 390365.73 | backward_inner_microstep: 390348.50 | backward_inner: 390341.70 | backward_allreduce_microstep: 7.98 | backward_allreduce: 2.75 | reduce_tied_grads: 0.33 | comms: 18.21 | reduce_grads: 0.21 | step: 358.36 | _step_clipping: 0.11 | _step_step: 356.57 | _step_zero_grad: 0.48 | _step_check_overflow: 0.62 samples/sec: 16.131 | iteration 19170/ 143000 | elapsed time per iteration (ms): 63478.4 | learning rate: 5.776E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.374063E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 22:15:25,904] [INFO] [logging.py:60:log_dist] [Rank 0] step=19180, skipped=20, lr=[0.0005775281678434937, 0.0005775281678434937], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19180 loss: 2.3655 iter time (s): 63.894 samples/sec: 16.027 %comms: 0.0029050944339969425 %optimizer_step 0.056823974005277396 %forward: 22.793723030137766 %backward: 61.10260943135292 [2025-04-07 22:15:25,904] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30200.43 | forward: 145638.45 | backward_microstep: 390423.22 | backward: 390409.64 | backward_inner_microstep: 390391.74 | backward_inner: 390384.74 | backward_allreduce_microstep: 8.52 | backward_allreduce: 3.04 | reduce_tied_grads: 0.34 | comms: 18.56 | reduce_grads: 0.21 | step: 363.07 | _step_clipping: 0.14 | _step_step: 361.13 | _step_zero_grad: 0.55 | _step_check_overflow: 0.62 samples/sec: 16.026 | iteration 19180/ 143000 | elapsed time per iteration (ms): 63894.8 | learning rate: 5.775E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.381697E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 22:26:03,306] [INFO] [logging.py:60:log_dist] [Rank 0] step=19190, skipped=20, lr=[0.0005775031335200039, 0.0005775031335200039], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19190 loss: 2.3587 iter time (s): 63.740 samples/sec: 16.065 %comms: 0.0028917123604917344 %optimizer_step 0.05747571946399579 %forward: 22.89392147492544 %backward: 61.28862055875246 [2025-04-07 22:26:03,307] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28026.88 | forward: 145924.98 | backward_microstep: 390675.28 | backward: 390651.31 | backward_inner_microstep: 390629.71 | backward_inner: 390620.69 | backward_allreduce_microstep: 10.25 | backward_allreduce: 3.08 | reduce_tied_grads: 0.35 | comms: 18.43 | reduce_grads: 0.22 | step: 366.35 | _step_clipping: 0.13 | _step_step: 364.37 | _step_zero_grad: 0.54 | _step_check_overflow: 0.66 samples/sec: 16.065 | iteration 19190/ 143000 | elapsed time per iteration (ms): 63740.2 | learning rate: 5.775E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.369384E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 22:36:34,852] [INFO] [logging.py:60:log_dist] [Rank 0] step=19200, skipped=20, lr=[0.0005774780858029684, 0.0005774780858029684], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19200 loss: 2.3478 iter time (s): 63.154 samples/sec: 16.214 %comms: 0.0029013102134884377 %optimizer_step 0.05710207512913502 %forward: 23.08439959695738 %backward: 61.86693522604213 [2025-04-07 22:36:34,853] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22294.65 | forward: 145787.30 | backward_microstep: 390732.98 | backward: 390714.66 | backward_inner_microstep: 390694.53 | backward_inner: 390687.28 | backward_allreduce_microstep: 10.33 | backward_allreduce: 2.96 | reduce_tied_grads: 0.35 | comms: 18.32 | reduce_grads: 0.21 | step: 360.62 | _step_clipping: 0.13 | _step_step: 358.68 | _step_zero_grad: 0.55 | _step_check_overflow: 0.63 samples/sec: 16.214 | iteration 19200/ 143000 | elapsed time per iteration (ms): 63154.7 | learning rate: 5.775E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.362099E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 22:47:09,051] [INFO] [logging.py:60:log_dist] [Rank 0] step=19210, skipped=20, lr=[0.0005774530246935965, 0.0005774530246935965], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19210 loss: 2.3770 iter time (s): 63.419 samples/sec: 16.147 %comms: 0.002912595944942384 %optimizer_step 0.05805436237492806 %forward: 22.968973351301617 %backward: 61.58706381234745 [2025-04-07 22:47:09,053] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25189.46 | forward: 145667.62 | backward_microstep: 390598.70 | backward: 390580.84 | backward_inner_microstep: 390562.70 | backward_inner: 390555.54 | backward_allreduce_microstep: 8.42 | backward_allreduce: 2.91 | reduce_tied_grads: 0.35 | comms: 18.47 | reduce_grads: 0.22 | step: 368.18 | _step_clipping: 0.12 | _step_step: 365.85 | _step_zero_grad: 0.60 | _step_check_overflow: 0.93 samples/sec: 16.146 | iteration 19210/ 143000 | elapsed time per iteration (ms): 63419.9 | learning rate: 5.775E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.362606E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 22:57:43,555] [INFO] [logging.py:60:log_dist] [Rank 0] step=19220, skipped=20, lr=[0.0005774279501930973, 0.0005774279501930973], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19220 loss: 2.3603 iter time (s): 63.450 samples/sec: 16.139 %comms: 0.0028983116313056343 %optimizer_step 0.05690587562870806 %forward: 22.958963713373226 %backward: 61.54382911323454 [2025-04-07 22:57:43,556] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25616.91 | forward: 145673.95 | backward_microstep: 390511.03 | backward: 390493.78 | backward_inner_microstep: 390471.71 | backward_inner: 390462.93 | backward_allreduce_microstep: 12.30 | backward_allreduce: 2.97 | reduce_tied_grads: 0.33 | comms: 18.39 | reduce_grads: 0.21 | step: 361.07 | _step_clipping: 0.14 | _step_step: 359.23 | _step_zero_grad: 0.50 | _step_check_overflow: 0.58 samples/sec: 16.139 | iteration 19220/ 143000 | elapsed time per iteration (ms): 63450.3 | learning rate: 5.774E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.367330E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 23:08:20,830] [INFO] [logging.py:60:log_dist] [Rank 0] step=19230, skipped=20, lr=[0.0005774028623026813, 0.0005774028623026813], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19230 loss: 2.3776 iter time (s): 63.727 samples/sec: 16.069 %comms: 0.0028957691142396285 %optimizer_step 0.06002698426121147 %forward: 22.86058864532053 %backward: 61.25627283811262 [2025-04-07 23:08:20,831] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28519.51 | forward: 145683.43 | backward_microstep: 390383.09 | backward: 390367.19 | backward_inner_microstep: 390349.13 | backward_inner: 390342.19 | backward_allreduce_microstep: 8.58 | backward_allreduce: 2.89 | reduce_tied_grads: 0.35 | comms: 18.45 | reduce_grads: 0.21 | step: 382.53 | _step_clipping: 0.14 | _step_step: 380.42 | _step_zero_grad: 0.62 | _step_check_overflow: 0.68 samples/sec: 16.068 | iteration 19230/ 143000 | elapsed time per iteration (ms): 63727.5 | learning rate: 5.774E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.383171E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 23:19:00,471] [INFO] [logging.py:60:log_dist] [Rank 0] step=19240, skipped=20, lr=[0.0005773777610235592, 0.0005773777610235592], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19240 loss: 2.3639 iter time (s): 63.963 samples/sec: 16.009 %comms: 0.002880177370797595 %optimizer_step 0.056387551216035635 %forward: 22.744686558806436 %backward: 61.0090728293287 [2025-04-07 23:19:00,472] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31252.32 | forward: 145482.83 | backward_microstep: 390245.55 | backward: 390234.98 | backward_inner_microstep: 390217.63 | backward_inner: 390210.99 | backward_allreduce_microstep: 8.23 | backward_allreduce: 2.84 | reduce_tied_grads: 0.34 | comms: 18.42 | reduce_grads: 0.20 | step: 360.67 | _step_clipping: 0.13 | _step_step: 358.74 | _step_zero_grad: 0.56 | _step_check_overflow: 0.64 samples/sec: 16.009 | iteration 19240/ 143000 | elapsed time per iteration (ms): 63964.0 | learning rate: 5.774E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.367389E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 23:29:53,082] [INFO] [logging.py:60:log_dist] [Rank 0] step=19250, skipped=20, lr=[0.0005773526463569425, 0.0005773526463569425], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19250 loss: 2.3910 iter time (s): 65.261 samples/sec: 15.691 %comms: 0.0027837693267216963 %optimizer_step 0.05573972178974715 %forward: 22.31621036350404 %backward: 59.82102639257803 [2025-04-07 23:29:53,083] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 43875.59 | forward: 145636.71 | backward_microstep: 390409.54 | backward: 390395.03 | backward_inner_microstep: 390376.15 | backward_inner: 390368.95 | backward_allreduce_microstep: 8.92 | backward_allreduce: 3.08 | reduce_tied_grads: 0.35 | comms: 18.17 | reduce_grads: 0.22 | step: 363.76 | _step_clipping: 0.14 | _step_step: 361.88 | _step_zero_grad: 0.56 | _step_check_overflow: 0.56 samples/sec: 15.691 | iteration 19250/ 143000 | elapsed time per iteration (ms): 65261.1 | learning rate: 5.774E-04 | approx flops per GPU: 67.7TFLOPS | lm_loss: 2.371455E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 23:40:24,912] [INFO] [logging.py:60:log_dist] [Rank 0] step=19260, skipped=20, lr=[0.0005773275183040435, 0.0005773275183040435], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19260 loss: 2.3588 iter time (s): 63.182 samples/sec: 16.207 %comms: 0.0028661602706047135 %optimizer_step 0.0567100497510179 %forward: 23.030599147425864 %backward: 61.771294986673134 [2025-04-07 23:40:24,913] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23340.76 | forward: 145512.81 | backward_microstep: 390298.51 | backward: 390285.75 | backward_inner_microstep: 390267.53 | backward_inner: 390260.62 | backward_allreduce_microstep: 8.61 | backward_allreduce: 3.00 | reduce_tied_grads: 0.35 | comms: 18.11 | reduce_grads: 0.21 | step: 358.31 | _step_clipping: 0.14 | _step_step: 356.42 | _step_zero_grad: 0.54 | _step_check_overflow: 0.59 samples/sec: 16.207 | iteration 19260/ 143000 | elapsed time per iteration (ms): 63183.0 | learning rate: 5.773E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.384735E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-07 23:51:13,042] [INFO] [logging.py:60:log_dist] [Rank 0] step=19270, skipped=20, lr=[0.0005773023768660748, 0.0005773023768660748], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19270 loss: 2.3784 iter time (s): 64.812 samples/sec: 15.799 %comms: 0.0027943338619613243 %optimizer_step 0.05531049900168798 %forward: 22.450650315561855 %backward: 60.2012549346679 [2025-04-07 23:51:13,043] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 39745.79 | forward: 145508.08 | backward_microstep: 390190.54 | backward: 390178.86 | backward_inner_microstep: 390160.37 | backward_inner: 390153.44 | backward_allreduce_microstep: 8.88 | backward_allreduce: 3.07 | reduce_tied_grads: 0.32 | comms: 18.11 | reduce_grads: 0.22 | step: 358.48 | _step_clipping: 0.13 | _step_step: 356.69 | _step_zero_grad: 0.51 | _step_check_overflow: 0.55 samples/sec: 15.799 | iteration 19270/ 143000 | elapsed time per iteration (ms): 64813.0 | learning rate: 5.773E-04 | approx flops per GPU: 68.2TFLOPS | lm_loss: 2.365222E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 00:01:46,968] [INFO] [logging.py:60:log_dist] [Rank 0] step=19280, skipped=20, lr=[0.0005772772220442499, 0.0005772772220442499], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19280 loss: 2.3661 iter time (s): 63.392 samples/sec: 16.153 %comms: 0.0028771457309831857 %optimizer_step 0.05643931735454599 %forward: 22.98675781858297 %backward: 61.61346345908123 [2025-04-08 00:01:46,969] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24878.00 | forward: 145717.50 | backward_microstep: 390598.37 | backward: 390579.65 | backward_inner_microstep: 390559.75 | backward_inner: 390552.47 | backward_allreduce_microstep: 8.45 | backward_allreduce: 2.93 | reduce_tied_grads: 0.35 | comms: 18.24 | reduce_grads: 0.22 | step: 357.78 | _step_clipping: 0.14 | _step_step: 355.80 | _step_zero_grad: 0.53 | _step_check_overflow: 0.66 samples/sec: 16.153 | iteration 19280/ 143000 | elapsed time per iteration (ms): 63392.6 | learning rate: 5.773E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.367373E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 00:12:20,898] [INFO] [logging.py:60:log_dist] [Rank 0] step=19290, skipped=20, lr=[0.0005772520538397831, 0.0005772520538397831], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19290 loss: 2.3507 iter time (s): 63.392 samples/sec: 16.153 %comms: 0.0028707688712457085 %optimizer_step 0.05653391260065915 %forward: 22.937801064880514 %backward: 61.565450868230855 [2025-04-08 00:12:20,898] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25514.02 | forward: 145408.20 | backward_microstep: 390292.59 | backward: 390278.11 | backward_inner_microstep: 390255.57 | backward_inner: 390248.80 | backward_allreduce_microstep: 11.56 | backward_allreduce: 4.42 | reduce_tied_grads: 0.36 | comms: 18.20 | reduce_grads: 0.21 | step: 358.38 | _step_clipping: 0.12 | _step_step: 356.60 | _step_zero_grad: 0.53 | _step_check_overflow: 0.54 samples/sec: 16.153 | iteration 19290/ 143000 | elapsed time per iteration (ms): 63393.0 | learning rate: 5.773E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.368323E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 00:22:54,408] [INFO] [logging.py:60:log_dist] [Rank 0] step=19300, skipped=20, lr=[0.0005772268722538888, 0.0005772268722538888], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19300 loss: 2.3426 iter time (s): 63.350 samples/sec: 16.164 %comms: 0.0031543704219378983 %optimizer_step 0.06288124437300154 %forward: 22.97511301733061 %backward: 61.62316455468593 [2025-04-08 00:22:54,409] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24829.64 | forward: 145548.19 | backward_microstep: 390399.61 | backward: 390385.03 | backward_inner_microstep: 390366.85 | backward_inner: 390359.72 | backward_allreduce_microstep: 8.57 | backward_allreduce: 2.95 | reduce_tied_grads: 2.12 | comms: 19.98 | reduce_grads: 0.23 | step: 398.36 | _step_clipping: 0.17 | _step_step: 394.35 | _step_zero_grad: 0.66 | _step_check_overflow: 0.74 samples/sec: 16.164 | iteration 19300/ 143000 | elapsed time per iteration (ms): 63351.0 | learning rate: 5.772E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.358254E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 00:33:33,127] [INFO] [logging.py:60:log_dist] [Rank 0] step=19310, skipped=20, lr=[0.0005772016772877826, 0.0005772016772877826], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19310 loss: 2.3676 iter time (s): 63.871 samples/sec: 16.032 %comms: 0.002935209975040155 %optimizer_step 0.05809709745669881 %forward: 22.816564198748853 %backward: 61.15875532037464 [2025-04-08 00:33:33,128] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29622.23 | forward: 145732.37 | backward_microstep: 390645.07 | backward: 390628.93 | backward_inner_microstep: 390611.12 | backward_inner: 390604.00 | backward_allreduce_microstep: 8.27 | backward_allreduce: 2.87 | reduce_tied_grads: 0.35 | comms: 18.75 | reduce_grads: 0.23 | step: 371.07 | _step_clipping: 0.12 | _step_step: 369.27 | _step_zero_grad: 0.51 | _step_check_overflow: 0.56 samples/sec: 16.032 | iteration 19310/ 143000 | elapsed time per iteration (ms): 63871.9 | learning rate: 5.772E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.370177E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 00:44:07,466] [INFO] [logging.py:60:log_dist] [Rank 0] step=19320, skipped=20, lr=[0.0005771764689426804, 0.0005771764689426804], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19320 loss: 2.3725 iter time (s): 63.433 samples/sec: 16.143 %comms: 0.0028495599140645946 %optimizer_step 0.05698984519605934 %forward: 22.955383950871735 %backward: 61.5353813469043 [2025-04-08 00:44:07,466] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25688.47 | forward: 145613.62 | backward_microstep: 390351.46 | backward: 390339.35 | backward_inner_microstep: 390322.03 | backward_inner: 390314.96 | backward_allreduce_microstep: 8.24 | backward_allreduce: 2.84 | reduce_tied_grads: 0.31 | comms: 18.08 | reduce_grads: 0.20 | step: 361.51 | _step_clipping: 0.13 | _step_step: 359.79 | _step_zero_grad: 0.50 | _step_check_overflow: 0.50 samples/sec: 16.143 | iteration 19320/ 143000 | elapsed time per iteration (ms): 63433.9 | learning rate: 5.772E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.384494E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 00:54:48,680] [INFO] [logging.py:60:log_dist] [Rank 0] step=19330, skipped=20, lr=[0.0005771512472197989, 0.0005771512472197989], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19330 loss: 2.3673 iter time (s): 64.121 samples/sec: 15.970 %comms: 0.0028722506002823232 %optimizer_step 0.05780486599751064 %forward: 22.744843807369683 %backward: 60.899110592302804 [2025-04-08 00:54:48,681] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32167.45 | forward: 145841.91 | backward_microstep: 390505.59 | backward: 390490.38 | backward_inner_microstep: 390472.24 | backward_inner: 390465.22 | backward_allreduce_microstep: 8.45 | backward_allreduce: 2.90 | reduce_tied_grads: 0.32 | comms: 18.42 | reduce_grads: 0.20 | step: 370.65 | _step_clipping: 0.14 | _step_step: 368.72 | _step_zero_grad: 0.56 | _step_check_overflow: 0.59 samples/sec: 15.970 | iteration 19330/ 143000 | elapsed time per iteration (ms): 64121.5 | learning rate: 5.772E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.375875E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 01:05:21,705] [INFO] [logging.py:60:log_dist] [Rank 0] step=19340, skipped=20, lr=[0.0005771260121203555, 0.0005771260121203555], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19340 loss: 2.3599 iter time (s): 63.302 samples/sec: 16.176 %comms: 0.0028902764914444383 %optimizer_step 0.05634203051787277 %forward: 22.970985678149113 %backward: 61.658626348014046 [2025-04-08 01:05:21,706] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24602.00 | forward: 145410.74 | backward_microstep: 390322.75 | backward: 390310.91 | backward_inner_microstep: 390293.49 | backward_inner: 390286.81 | backward_allreduce_microstep: 8.29 | backward_allreduce: 2.85 | reduce_tied_grads: 0.34 | comms: 18.30 | reduce_grads: 0.21 | step: 356.66 | _step_clipping: 0.14 | _step_step: 354.88 | _step_zero_grad: 0.53 | _step_check_overflow: 0.51 samples/sec: 16.176 | iteration 19340/ 143000 | elapsed time per iteration (ms): 63302.5 | learning rate: 5.771E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.367205E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 01:15:52,935] [INFO] [logging.py:60:log_dist] [Rank 0] step=19350, skipped=20, lr=[0.0005771007636455679, 0.0005771007636455679], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19350 loss: 2.3419 iter time (s): 63.122 samples/sec: 16.222 %comms: 0.002915799462120758 %optimizer_step 0.057895599227085495 %forward: 23.050034549184957 %backward: 61.85700546412441 [2025-04-08 01:15:52,935] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22550.72 | forward: 145497.14 | backward_microstep: 390471.92 | backward: 390455.69 | backward_inner_microstep: 390438.01 | backward_inner: 390430.69 | backward_allreduce_microstep: 8.29 | backward_allreduce: 2.84 | reduce_tied_grads: 0.35 | comms: 18.41 | reduce_grads: 0.21 | step: 365.45 | _step_clipping: 0.13 | _step_step: 363.63 | _step_zero_grad: 0.53 | _step_check_overflow: 0.51 samples/sec: 16.222 | iteration 19350/ 143000 | elapsed time per iteration (ms): 63122.9 | learning rate: 5.771E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.367903E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 01:26:20,288] [INFO] [logging.py:60:log_dist] [Rank 0] step=19360, skipped=20, lr=[0.000577075501796655, 0.000577075501796655], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19360 loss: 2.3532 iter time (s): 62.735 samples/sec: 16.323 %comms: 0.002956616001216668 %optimizer_step 0.06044597464723726 %forward: 23.18644672965851 %backward: 62.23112422181446 [2025-04-08 01:26:20,289] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18758.49 | forward: 145459.54 | backward_microstep: 390420.05 | backward: 390405.27 | backward_inner_microstep: 390387.35 | backward_inner: 390380.19 | backward_allreduce_microstep: 8.42 | backward_allreduce: 2.91 | reduce_tied_grads: 0.33 | comms: 18.55 | reduce_grads: 0.21 | step: 379.21 | _step_clipping: 0.14 | _step_step: 377.13 | _step_zero_grad: 0.56 | _step_check_overflow: 0.72 samples/sec: 16.323 | iteration 19360/ 143000 | elapsed time per iteration (ms): 62735.4 | learning rate: 5.771E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.365010E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 01:37:02,078] [INFO] [logging.py:60:log_dist] [Rank 0] step=19370, skipped=20, lr=[0.000577050226574836, 0.000577050226574836], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19370 loss: 2.3791 iter time (s): 64.178 samples/sec: 15.956 %comms: 0.0028940858765181083 %optimizer_step 0.055588634443306854 %forward: 22.69292623075107 %backward: 60.81849187939727 [2025-04-08 01:37:02,079] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 33110.90 | forward: 145639.42 | backward_microstep: 390336.57 | backward: 390322.95 | backward_inner_microstep: 390304.92 | backward_inner: 390297.89 | backward_allreduce_microstep: 8.49 | backward_allreduce: 2.92 | reduce_tied_grads: 0.33 | comms: 18.57 | reduce_grads: 0.21 | step: 356.76 | _step_clipping: 0.13 | _step_step: 354.84 | _step_zero_grad: 0.54 | _step_check_overflow: 0.59 samples/sec: 15.955 | iteration 19370/ 143000 | elapsed time per iteration (ms): 64179.0 | learning rate: 5.771E-04 | approx flops per GPU: 68.8TFLOPS | lm_loss: 2.363240E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 01:47:41,141] [INFO] [logging.py:60:log_dist] [Rank 0] step=19380, skipped=20, lr=[0.0005770249379813307, 0.0005770249379813307], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19380 loss: 2.3736 iter time (s): 63.906 samples/sec: 16.024 %comms: 0.0028592379200745857 %optimizer_step 0.05488383276985925 %forward: 22.765539590239996 %backward: 61.085609021620414 [2025-04-08 01:47:41,142] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30481.54 | forward: 145484.79 | backward_microstep: 390387.41 | backward: 390371.89 | backward_inner_microstep: 390348.87 | backward_inner: 390339.47 | backward_allreduce_microstep: 8.35 | backward_allreduce: 2.86 | reduce_tied_grads: 0.33 | comms: 18.27 | reduce_grads: 0.20 | step: 350.74 | _step_clipping: 0.14 | _step_step: 348.95 | _step_zero_grad: 0.48 | _step_check_overflow: 0.56 samples/sec: 16.023 | iteration 19380/ 143000 | elapsed time per iteration (ms): 63906.3 | learning rate: 5.770E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.368777E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 01:58:10,849] [INFO] [logging.py:60:log_dist] [Rank 0] step=19390, skipped=20, lr=[0.0005769996360173597, 0.0005769996360173597], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19390 loss: 2.3644 iter time (s): 62.970 samples/sec: 16.262 %comms: 0.0028838067373286452 %optimizer_step 0.057758257699245565 %forward: 23.086833546819378 %backward: 61.9720753011531 [2025-04-08 01:58:10,850] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21396.46 | forward: 145378.26 | backward_microstep: 390251.27 | backward: 390239.41 | backward_inner_microstep: 390219.94 | backward_inner: 390211.35 | backward_allreduce_microstep: 8.40 | backward_allreduce: 2.88 | reduce_tied_grads: 0.52 | comms: 18.16 | reduce_grads: 0.21 | step: 363.70 | _step_clipping: 0.14 | _step_step: 361.72 | _step_zero_grad: 0.55 | _step_check_overflow: 0.68 samples/sec: 16.262 | iteration 19390/ 143000 | elapsed time per iteration (ms): 62970.8 | learning rate: 5.770E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.368733E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 02:08:39,240] [INFO] [logging.py:60:log_dist] [Rank 0] step=19400, skipped=20, lr=[0.0005769743206841441, 0.0005769743206841441], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19400 loss: 2.3679 iter time (s): 62.838 samples/sec: 16.296 %comms: 0.0029130382440098472 %optimizer_step 0.05961338107253255 %forward: 23.18215807013994 %backward: 62.128548155193265 [2025-04-08 02:08:39,240] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19592.37 | forward: 145672.95 | backward_microstep: 390420.03 | backward: 390405.79 | backward_inner_microstep: 390387.96 | backward_inner: 390381.08 | backward_allreduce_microstep: 8.36 | backward_allreduce: 2.85 | reduce_tied_grads: 0.33 | comms: 18.31 | reduce_grads: 0.22 | step: 374.60 | _step_clipping: 0.13 | _step_step: 372.55 | _step_zero_grad: 0.61 | _step_check_overflow: 0.63 samples/sec: 16.296 | iteration 19400/ 143000 | elapsed time per iteration (ms): 62839.0 | learning rate: 5.770E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.376766E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 02:19:16,637] [INFO] [logging.py:60:log_dist] [Rank 0] step=19410, skipped=20, lr=[0.0005769489919829057, 0.0005769489919829057], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19410 loss: 2.3597 iter time (s): 63.739 samples/sec: 16.066 %comms: 0.002929741271736902 %optimizer_step 0.057576912033637795 %forward: 22.863991936123927 %backward: 61.26154727986945 [2025-04-08 02:19:16,637] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28402.63 | forward: 145732.95 | backward_microstep: 390491.74 | backward: 390475.37 | backward_inner_microstep: 390457.08 | backward_inner: 390450.08 | backward_allreduce_microstep: 8.62 | backward_allreduce: 3.02 | reduce_tied_grads: 0.38 | comms: 18.67 | reduce_grads: 0.25 | step: 366.99 | _step_clipping: 0.14 | _step_step: 364.95 | _step_zero_grad: 0.58 | _step_check_overflow: 0.62 samples/sec: 16.065 | iteration 19410/ 143000 | elapsed time per iteration (ms): 63739.7 | learning rate: 5.769E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.364781E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 02:29:58,144] [INFO] [logging.py:60:log_dist] [Rank 0] step=19420, skipped=20, lr=[0.0005769236499148673, 0.0005769236499148673], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19420 loss: 2.3786 iter time (s): 64.150 samples/sec: 15.963 %comms: 0.0028488975102612976 %optimizer_step 0.055220794343983956 %forward: 22.716969635708956 %backward: 60.84296175384193 [2025-04-08 02:29:58,145] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32764.96 | forward: 145729.84 | backward_microstep: 390320.52 | backward: 390308.89 | backward_inner_microstep: 390290.71 | backward_inner: 390283.71 | backward_allreduce_microstep: 8.55 | backward_allreduce: 2.94 | reduce_tied_grads: 0.30 | comms: 18.28 | reduce_grads: 0.21 | step: 354.24 | _step_clipping: 0.13 | _step_step: 352.36 | _step_zero_grad: 0.55 | _step_check_overflow: 0.59 samples/sec: 15.962 | iteration 19420/ 143000 | elapsed time per iteration (ms): 64150.8 | learning rate: 5.769E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.366074E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 02:40:35,225] [INFO] [logging.py:60:log_dist] [Rank 0] step=19430, skipped=20, lr=[0.0005768982944812516, 0.0005768982944812516], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19430 loss: 2.3873 iter time (s): 63.707 samples/sec: 16.073 %comms: 0.00293722470914275 %optimizer_step 0.0573758975387402 %forward: 22.919233078529256 %backward: 61.33237949020784 [2025-04-08 02:40:35,225] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27566.60 | forward: 146012.38 | backward_microstep: 390751.36 | backward: 390732.38 | backward_inner_microstep: 390713.86 | backward_inner: 390706.61 | backward_allreduce_microstep: 8.57 | backward_allreduce: 2.94 | reduce_tied_grads: 0.40 | comms: 18.71 | reduce_grads: 0.21 | step: 365.53 | _step_clipping: 0.14 | _step_step: 363.32 | _step_zero_grad: 0.59 | _step_check_overflow: 0.79 samples/sec: 16.073 | iteration 19430/ 143000 | elapsed time per iteration (ms): 63708.0 | learning rate: 5.769E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.372166E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 02:51:13,995] [INFO] [logging.py:60:log_dist] [Rank 0] step=19440, skipped=20, lr=[0.0005768729256832826, 0.0005768729256832826], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19440 loss: 2.3703 iter time (s): 63.876 samples/sec: 16.031 %comms: 0.0028936933292176676 %optimizer_step 0.05650314420605689 %forward: 22.797119480446025 %backward: 61.09878293524945 [2025-04-08 02:51:13,996] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30172.51 | forward: 145619.84 | backward_microstep: 390290.11 | backward: 390277.15 | backward_inner_microstep: 390259.83 | backward_inner: 390253.16 | backward_allreduce_microstep: 8.27 | backward_allreduce: 2.85 | reduce_tied_grads: 0.54 | comms: 18.48 | reduce_grads: 0.21 | step: 360.92 | _step_clipping: 0.13 | _step_step: 359.04 | _step_zero_grad: 0.57 | _step_check_overflow: 0.58 samples/sec: 16.031 | iteration 19440/ 143000 | elapsed time per iteration (ms): 63877.0 | learning rate: 5.769E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.366034E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 03:01:49,069] [INFO] [logging.py:60:log_dist] [Rank 0] step=19450, skipped=20, lr=[0.0005768475435221847, 0.0005768475435221847], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19450 loss: 2.3642 iter time (s): 63.507 samples/sec: 16.124 %comms: 0.0029168777221128474 %optimizer_step 0.05836150635724238 %forward: 22.998254661121777 %backward: 61.515356254650975 [2025-04-08 03:01:49,071] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25544.99 | forward: 146054.66 | backward_microstep: 390682.75 | backward: 390664.61 | backward_inner_microstep: 390646.24 | backward_inner: 390639.15 | backward_allreduce_microstep: 8.50 | backward_allreduce: 2.92 | reduce_tied_grads: 0.37 | comms: 18.52 | reduce_grads: 0.21 | step: 370.64 | _step_clipping: 0.13 | _step_step: 368.33 | _step_zero_grad: 0.59 | _step_check_overflow: 0.90 samples/sec: 16.124 | iteration 19450/ 143000 | elapsed time per iteration (ms): 63507.5 | learning rate: 5.768E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.361398E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 03:12:27,864] [INFO] [logging.py:60:log_dist] [Rank 0] step=19460, skipped=20, lr=[0.000576822147999183, 0.000576822147999183], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19460 loss: 2.3496 iter time (s): 63.879 samples/sec: 16.030 %comms: 0.0028333087419985157 %optimizer_step 0.05523851000845244 %forward: 22.812260787364607 %backward: 61.10231834603245 [2025-04-08 03:12:27,864] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30080.99 | forward: 145721.94 | backward_microstep: 390326.72 | backward: 390314.17 | backward_inner_microstep: 390296.51 | backward_inner: 390289.73 | backward_allreduce_microstep: 8.20 | backward_allreduce: 2.80 | reduce_tied_grads: 0.29 | comms: 18.10 | reduce_grads: 0.19 | step: 352.86 | _step_clipping: 0.11 | _step_step: 351.17 | _step_zero_grad: 0.50 | _step_check_overflow: 0.49 samples/sec: 16.030 | iteration 19460/ 143000 | elapsed time per iteration (ms): 63879.4 | learning rate: 5.768E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.373726E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 03:23:04,425] [INFO] [logging.py:60:log_dist] [Rank 0] step=19470, skipped=20, lr=[0.000576796739115503, 0.000576796739115503], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19470 loss: 2.3444 iter time (s): 63.656 samples/sec: 16.087 %comms: 0.002864146612443254 %optimizer_step 0.05676969305662878 %forward: 22.887038863251814 %backward: 61.33711668460975 [2025-04-08 03:23:04,426] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27667.65 | forward: 145688.59 | backward_microstep: 390459.86 | backward: 390444.49 | backward_inner_microstep: 390426.39 | backward_inner: 390419.30 | backward_allreduce_microstep: 8.50 | backward_allreduce: 2.94 | reduce_tied_grads: 0.34 | comms: 18.23 | reduce_grads: 0.22 | step: 361.37 | _step_clipping: 0.13 | _step_step: 359.56 | _step_zero_grad: 0.55 | _step_check_overflow: 0.47 samples/sec: 16.086 | iteration 19470/ 143000 | elapsed time per iteration (ms): 63656.1 | learning rate: 5.768E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.363446E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 03:33:37,030] [INFO] [logging.py:60:log_dist] [Rank 0] step=19480, skipped=20, lr=[0.0005767713168723712, 0.0005767713168723712], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19480 loss: 2.3539 iter time (s): 63.260 samples/sec: 16.187 %comms: 0.002867887538887047 %optimizer_step 0.058816946007891675 %forward: 22.98939396476589 %backward: 61.66937867291848 [2025-04-08 03:33:37,031] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24351.32 | forward: 145430.64 | backward_microstep: 390129.06 | backward: 390119.78 | backward_inner_microstep: 390102.09 | backward_inner: 390095.54 | backward_allreduce_microstep: 8.69 | backward_allreduce: 2.90 | reduce_tied_grads: 0.34 | comms: 18.14 | reduce_grads: 0.24 | step: 372.08 | _step_clipping: 0.15 | _step_step: 369.85 | _step_zero_grad: 0.57 | _step_check_overflow: 0.85 samples/sec: 16.187 | iteration 19480/ 143000 | elapsed time per iteration (ms): 63260.5 | learning rate: 5.768E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.363543E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 03:44:19,821] [INFO] [logging.py:60:log_dist] [Rank 0] step=19490, skipped=20, lr=[0.0005767458812710146, 0.0005767458812710146], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19490 loss: 2.3589 iter time (s): 64.278 samples/sec: 15.931 %comms: 0.002817255503686054 %optimizer_step 0.056586316758562835 %forward: 22.683566288750317 %backward: 60.743399915135676 [2025-04-08 03:44:19,821] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 33779.80 | forward: 145806.15 | backward_microstep: 390462.95 | backward: 390448.36 | backward_inner_microstep: 390430.69 | backward_inner: 390422.03 | backward_allreduce_microstep: 8.33 | backward_allreduce: 2.87 | reduce_tied_grads: 0.31 | comms: 18.11 | reduce_grads: 0.21 | step: 363.73 | _step_clipping: 0.14 | _step_step: 361.88 | _step_zero_grad: 0.58 | _step_check_overflow: 0.51 samples/sec: 15.931 | iteration 19490/ 143000 | elapsed time per iteration (ms): 64279.0 | learning rate: 5.767E-04 | approx flops per GPU: 68.7TFLOPS | lm_loss: 2.370755E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 03:54:57,580] [INFO] [logging.py:60:log_dist] [Rank 0] step=19500, skipped=20, lr=[0.0005767204323126609, 0.0005767204323126609], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19500 loss: 2.3771 iter time (s): 63.775 samples/sec: 16.056 %comms: 0.002906576937462572 %optimizer_step 0.06008156854014165 %forward: 22.882510634442905 %backward: 61.221283119206134 [2025-04-08 03:54:57,581] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28586.17 | forward: 145934.09 | backward_microstep: 390455.71 | backward: 390441.09 | backward_inner_microstep: 390423.26 | backward_inner: 390416.16 | backward_allreduce_microstep: 8.41 | backward_allreduce: 2.90 | reduce_tied_grads: 0.38 | comms: 18.54 | reduce_grads: 0.23 | step: 383.17 | _step_clipping: 0.13 | _step_step: 381.09 | _step_zero_grad: 0.61 | _step_check_overflow: 0.66 samples/sec: 16.056 | iteration 19500/ 143000 | elapsed time per iteration (ms): 63776.0 | learning rate: 5.767E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.376200E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 04:05:41,004] [INFO] [logging.py:60:log_dist] [Rank 0] step=19510, skipped=20, lr=[0.0005766949699985381, 0.0005766949699985381], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19510 loss: 2.3583 iter time (s): 64.342 samples/sec: 15.915 %comms: 0.002856722665486193 %optimizer_step 0.05592556908736128 %forward: 22.63352135129209 %backward: 60.66967565894217 [2025-04-08 04:05:41,004] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 34702.64 | forward: 145627.95 | backward_microstep: 390371.80 | backward: 390359.06 | backward_inner_microstep: 390341.34 | backward_inner: 390334.33 | backward_allreduce_microstep: 8.47 | backward_allreduce: 2.88 | reduce_tied_grads: 0.37 | comms: 18.38 | reduce_grads: 0.23 | step: 359.83 | _step_clipping: 0.15 | _step_step: 357.81 | _step_zero_grad: 0.57 | _step_check_overflow: 0.62 samples/sec: 15.915 | iteration 19510/ 143000 | elapsed time per iteration (ms): 64342.3 | learning rate: 5.767E-04 | approx flops per GPU: 68.7TFLOPS | lm_loss: 2.370184E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 04:16:10,944] [INFO] [logging.py:60:log_dist] [Rank 0] step=19520, skipped=20, lr=[0.0005766694943298755, 0.0005766694943298755], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19520 loss: 2.3692 iter time (s): 62.993 samples/sec: 16.256 %comms: 0.00290143831931458 %optimizer_step 0.05884144162229894 %forward: 23.079620324533643 %backward: 61.958972035451886 [2025-04-08 04:16:10,945] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21479.46 | forward: 145386.55 | backward_microstep: 390313.99 | backward: 390301.10 | backward_inner_microstep: 390283.60 | backward_inner: 390276.73 | backward_allreduce_microstep: 8.28 | backward_allreduce: 2.84 | reduce_tied_grads: 0.30 | comms: 18.28 | reduce_grads: 0.19 | step: 370.66 | _step_clipping: 0.12 | _step_step: 368.87 | _step_zero_grad: 0.51 | _step_check_overflow: 0.57 samples/sec: 16.256 | iteration 19520/ 143000 | elapsed time per iteration (ms): 62994.0 | learning rate: 5.767E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.363380E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 04:26:41,935] [INFO] [logging.py:60:log_dist] [Rank 0] step=19530, skipped=20, lr=[0.0005766440053079023, 0.0005766440053079023], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19530 loss: 2.3659 iter time (s): 63.098 samples/sec: 16.229 %comms: 0.0029259713355869366 %optimizer_step 0.06074127113333543 %forward: 23.07214952632536 %backward: 61.86612802898645 [2025-04-08 04:26:41,935] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22278.23 | forward: 145581.64 | backward_microstep: 390379.29 | backward: 390365.55 | backward_inner_microstep: 390348.35 | backward_inner: 390341.67 | backward_allreduce_microstep: 8.07 | backward_allreduce: 2.77 | reduce_tied_grads: 0.38 | comms: 18.46 | reduce_grads: 0.24 | step: 383.27 | _step_clipping: 0.14 | _step_step: 381.31 | _step_zero_grad: 0.57 | _step_check_overflow: 0.56 samples/sec: 16.228 | iteration 19530/ 143000 | elapsed time per iteration (ms): 63099.0 | learning rate: 5.766E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.363287E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 04:37:22,347] [INFO] [logging.py:60:log_dist] [Rank 0] step=19540, skipped=20, lr=[0.000576618502933849, 0.000576618502933849], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19540 loss: 2.3520 iter time (s): 64.040 samples/sec: 15.990 %comms: 0.0028778283721773473 %optimizer_step 0.058011813764133044 %forward: 22.75381188060612 %backward: 60.95726509043955 [2025-04-08 04:37:22,347] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31562.68 | forward: 145714.66 | backward_microstep: 390380.71 | backward: 390368.31 | backward_inner_microstep: 390350.20 | backward_inner: 390343.31 | backward_allreduce_microstep: 8.44 | backward_allreduce: 2.96 | reduce_tied_grads: 0.37 | comms: 18.43 | reduce_grads: 0.23 | step: 371.51 | _step_clipping: 0.12 | _step_step: 369.55 | _step_zero_grad: 0.61 | _step_check_overflow: 0.56 samples/sec: 15.990 | iteration 19540/ 143000 | elapsed time per iteration (ms): 64041.2 | learning rate: 5.766E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.363820E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 04:47:59,909] [INFO] [logging.py:60:log_dist] [Rank 0] step=19550, skipped=20, lr=[0.0005765929872089462, 0.0005765929872089462], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19550 loss: 2.3484 iter time (s): 63.756 samples/sec: 16.061 %comms: 0.002823562164764654 %optimizer_step 0.05674677479691531 %forward: 22.854701210829354 %backward: 61.203116596199216 [2025-04-08 04:47:59,909] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28919.33 | forward: 145711.56 | backward_microstep: 390214.39 | backward: 390204.25 | backward_inner_microstep: 390185.28 | backward_inner: 390178.72 | backward_allreduce_microstep: 9.85 | backward_allreduce: 2.79 | reduce_tied_grads: 0.30 | comms: 18.00 | reduce_grads: 0.21 | step: 361.79 | _step_clipping: 0.14 | _step_step: 360.02 | _step_zero_grad: 0.48 | _step_check_overflow: 0.56 samples/sec: 16.061 | iteration 19550/ 143000 | elapsed time per iteration (ms): 63756.2 | learning rate: 5.766E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.370312E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 04:58:33,755] [INFO] [logging.py:60:log_dist] [Rank 0] step=19560, skipped=20, lr=[0.0005765674581344257, 0.0005765674581344257], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19560 loss: 2.3970 iter time (s): 63.384 samples/sec: 16.155 %comms: 0.0028721990294977976 %optimizer_step 0.06007905524896272 %forward: 23.027113125514738 %backward: 61.59759640747551 [2025-04-08 04:58:33,756] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24669.41 | forward: 145955.21 | backward_microstep: 390446.95 | backward: 390430.62 | backward_inner_microstep: 390412.66 | backward_inner: 390403.77 | backward_allreduce_microstep: 8.41 | backward_allreduce: 2.89 | reduce_tied_grads: 0.36 | comms: 18.21 | reduce_grads: 0.25 | step: 380.81 | _step_clipping: 0.14 | _step_step: 378.80 | _step_zero_grad: 0.60 | _step_check_overflow: 0.59 samples/sec: 16.155 | iteration 19560/ 143000 | elapsed time per iteration (ms): 63384.7 | learning rate: 5.766E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.380204E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 05:09:13,449] [INFO] [logging.py:60:log_dist] [Rank 0] step=19570, skipped=20, lr=[0.0005765419157115196, 0.0005765419157115196], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19570 loss: 2.3532 iter time (s): 63.969 samples/sec: 16.008 %comms: 0.0028424807116970295 %optimizer_step 0.05620538320920207 %forward: 22.775008109893964 %backward: 61.03265696626862 [2025-04-08 05:09:13,450] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30828.58 | forward: 145688.87 | backward_microstep: 390434.31 | backward: 390418.26 | backward_inner_microstep: 390400.86 | backward_inner: 390394.31 | backward_allreduce_microstep: 8.20 | backward_allreduce: 2.83 | reduce_tied_grads: 0.30 | comms: 18.18 | reduce_grads: 0.20 | step: 359.54 | _step_clipping: 0.12 | _step_step: 357.74 | _step_zero_grad: 0.56 | _step_check_overflow: 0.49 samples/sec: 16.008 | iteration 19570/ 143000 | elapsed time per iteration (ms): 63969.4 | learning rate: 5.765E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.372924E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 05:19:48,708] [INFO] [logging.py:60:log_dist] [Rank 0] step=19580, skipped=20, lr=[0.0005765163599414604, 0.0005765163599414604], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19580 loss: 2.3593 iter time (s): 63.525 samples/sec: 16.120 %comms: 0.0028530173943873325 %optimizer_step 0.057036027600553485 %forward: 22.87220204760154 %backward: 61.429218720306864 [2025-04-08 05:19:48,708] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26974.61 | forward: 145296.24 | backward_microstep: 390242.32 | backward: 390230.66 | backward_inner_microstep: 390213.85 | backward_inner: 390207.34 | backward_allreduce_microstep: 7.94 | backward_allreduce: 2.72 | reduce_tied_grads: 0.34 | comms: 18.12 | reduce_grads: 0.20 | step: 362.32 | _step_clipping: 0.14 | _step_step: 360.52 | _step_zero_grad: 0.52 | _step_check_overflow: 0.53 samples/sec: 16.119 | iteration 19580/ 143000 | elapsed time per iteration (ms): 63525.9 | learning rate: 5.765E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.364356E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 05:30:26,830] [INFO] [logging.py:60:log_dist] [Rank 0] step=19590, skipped=20, lr=[0.0005764907908254818, 0.0005764907908254818], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19590 loss: 2.3592 iter time (s): 63.812 samples/sec: 16.047 %comms: 0.00355612376485241 %optimizer_step 0.05815483000676452 %forward: 22.820840428647696 %backward: 61.153565016745716 [2025-04-08 05:30:26,831] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29440.96 | forward: 145623.49 | backward_microstep: 390245.23 | backward: 390230.83 | backward_inner_microstep: 390213.17 | backward_inner: 390206.41 | backward_allreduce_microstep: 8.38 | backward_allreduce: 2.89 | reduce_tied_grads: 0.35 | comms: 22.69 | reduce_grads: 0.21 | step: 371.10 | _step_clipping: 0.15 | _step_step: 369.11 | _step_zero_grad: 0.55 | _step_check_overflow: 0.65 samples/sec: 16.047 | iteration 19590/ 143000 | elapsed time per iteration (ms): 63812.2 | learning rate: 5.765E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.365327E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 05:41:03,021] [INFO] [logging.py:60:log_dist] [Rank 0] step=19600, skipped=20, lr=[0.0005764652083648179, 0.0005764652083648179], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19600 loss: 2.3676 iter time (s): 63.618 samples/sec: 16.096 %comms: 0.00285798127084123 %optimizer_step 0.0598032421649188 %forward: 22.854960559914677 %backward: 61.336663138414 [2025-04-08 05:41:03,022] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27801.80 | forward: 145399.76 | backward_microstep: 390225.42 | backward: 390214.46 | backward_inner_microstep: 390197.42 | backward_inner: 390190.94 | backward_allreduce_microstep: 8.11 | backward_allreduce: 2.79 | reduce_tied_grads: 0.33 | comms: 18.18 | reduce_grads: 0.20 | step: 380.46 | _step_clipping: 0.14 | _step_step: 378.60 | _step_zero_grad: 0.55 | _step_check_overflow: 0.55 samples/sec: 16.096 | iteration 19600/ 143000 | elapsed time per iteration (ms): 63619.1 | learning rate: 5.765E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.369793E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 05:51:39,910] [INFO] [logging.py:60:log_dist] [Rank 0] step=19610, skipped=20, lr=[0.0005764396125607032, 0.0005764396125607032], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19610 loss: 2.3724 iter time (s): 63.688 samples/sec: 16.078 %comms: 0.0028358672300929475 %optimizer_step 0.05565736650670778 %forward: 22.860353626279604 %backward: 61.281344639298666 [2025-04-08 05:51:39,911] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28253.40 | forward: 145593.74 | backward_microstep: 390302.46 | backward: 390290.57 | backward_inner_microstep: 390273.71 | backward_inner: 390267.09 | backward_allreduce_microstep: 7.97 | backward_allreduce: 2.72 | reduce_tied_grads: 0.33 | comms: 18.06 | reduce_grads: 0.19 | step: 354.47 | _step_clipping: 0.14 | _step_step: 352.64 | _step_zero_grad: 0.53 | _step_check_overflow: 0.56 samples/sec: 16.078 | iteration 19610/ 143000 | elapsed time per iteration (ms): 63688.9 | learning rate: 5.764E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.362493E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 06:02:16,208] [INFO] [logging.py:60:log_dist] [Rank 0] step=19620, skipped=20, lr=[0.0005764140034143733, 0.0005764140034143733], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19620 loss: 2.3817 iter time (s): 63.629 samples/sec: 16.093 %comms: 0.0028554748541673018 %optimizer_step 0.06016566115659977 %forward: 22.907249637623128 %backward: 61.34161574792864 [2025-04-08 06:02:16,209] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27444.89 | forward: 145757.05 | backward_microstep: 390324.41 | backward: 390311.94 | backward_inner_microstep: 390291.42 | backward_inner: 390283.25 | backward_allreduce_microstep: 9.90 | backward_allreduce: 2.74 | reduce_tied_grads: 0.33 | comms: 18.17 | reduce_grads: 0.23 | step: 382.83 | _step_clipping: 0.14 | _step_step: 380.95 | _step_zero_grad: 0.52 | _step_check_overflow: 0.58 samples/sec: 16.093 | iteration 19620/ 143000 | elapsed time per iteration (ms): 63629.8 | learning rate: 5.764E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.366426E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 06:12:48,123] [INFO] [logging.py:60:log_dist] [Rank 0] step=19630, skipped=20, lr=[0.000576388380927064, 0.000576388380927064], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19630 loss: 2.3592 iter time (s): 63.191 samples/sec: 16.205 %comms: 0.002857624309972055 %optimizer_step 0.05688437758503292 %forward: 23.010245670715392 %backward: 61.76668035197773 [2025-04-08 06:12:48,124] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23431.01 | forward: 145403.81 | backward_microstep: 390321.23 | backward: 390309.20 | backward_inner_microstep: 390290.08 | backward_inner: 390283.47 | backward_allreduce_microstep: 10.03 | backward_allreduce: 4.72 | reduce_tied_grads: 0.33 | comms: 18.06 | reduce_grads: 0.24 | step: 359.46 | _step_clipping: 0.13 | _step_step: 357.66 | _step_zero_grad: 0.50 | _step_check_overflow: 0.57 samples/sec: 16.205 | iteration 19630/ 143000 | elapsed time per iteration (ms): 63191.5 | learning rate: 5.764E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.364819E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 06:23:17,672] [INFO] [logging.py:60:log_dist] [Rank 0] step=19640, skipped=20, lr=[0.0005763627451000121, 0.0005763627451000121], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19640 loss: 2.3756 iter time (s): 62.954 samples/sec: 16.266 %comms: 0.0028612439921067052 %optimizer_step 0.05855508505422601 %forward: 23.11299793219327 %backward: 62.00442748753699 [2025-04-08 06:23:17,673] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20923.49 | forward: 145506.27 | backward_microstep: 390358.00 | backward: 390344.55 | backward_inner_microstep: 390324.90 | backward_inner: 390317.84 | backward_allreduce_microstep: 10.11 | backward_allreduce: 2.87 | reduce_tied_grads: 0.31 | comms: 18.01 | reduce_grads: 0.20 | step: 368.63 | _step_clipping: 0.10 | _step_step: 366.85 | _step_zero_grad: 0.52 | _step_check_overflow: 0.56 samples/sec: 16.266 | iteration 19640/ 143000 | elapsed time per iteration (ms): 62954.9 | learning rate: 5.764E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.370629E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 06:33:53,606] [INFO] [logging.py:60:log_dist] [Rank 0] step=19650, skipped=20, lr=[0.0005763370959344549, 0.0005763370959344549], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19650 loss: 2.3722 iter time (s): 63.593 samples/sec: 16.102 %comms: 0.0028372385643129867 %optimizer_step 0.05605729102734609 %forward: 22.888544537066004 %backward: 61.360575080577064 [2025-04-08 06:33:53,607] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27392.90 | forward: 145554.73 | backward_microstep: 390220.96 | backward: 390209.26 | backward_inner_microstep: 390192.00 | backward_inner: 390185.42 | backward_allreduce_microstep: 8.08 | backward_allreduce: 2.80 | reduce_tied_grads: 0.31 | comms: 18.04 | reduce_grads: 0.19 | step: 356.48 | _step_clipping: 0.12 | _step_step: 354.73 | _step_zero_grad: 0.48 | _step_check_overflow: 0.57 samples/sec: 16.102 | iteration 19650/ 143000 | elapsed time per iteration (ms): 63593.4 | learning rate: 5.763E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.370469E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 06:44:26,772] [INFO] [logging.py:60:log_dist] [Rank 0] step=19660, skipped=20, lr=[0.0005763114334316305, 0.0005763114334316305], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19660 loss: 2.3750 iter time (s): 63.316 samples/sec: 16.173 %comms: 0.002886170117082094 %optimizer_step 0.059353691005504375 %forward: 23.001089023790176 %backward: 61.66993581560873 [2025-04-08 06:44:26,773] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24206.55 | forward: 145633.65 | backward_microstep: 390484.90 | backward: 390469.24 | backward_inner_microstep: 390451.44 | backward_inner: 390444.55 | backward_allreduce_microstep: 8.30 | backward_allreduce: 2.88 | reduce_tied_grads: 0.33 | comms: 18.27 | reduce_grads: 0.21 | step: 375.80 | _step_clipping: 0.16 | _step_step: 373.83 | _step_zero_grad: 0.54 | _step_check_overflow: 0.65 samples/sec: 16.173 | iteration 19660/ 143000 | elapsed time per iteration (ms): 63316.6 | learning rate: 5.763E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.370296E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 06:55:04,937] [INFO] [logging.py:60:log_dist] [Rank 0] step=19670, skipped=20, lr=[0.0005762857575927771, 0.0005762857575927771], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19670 loss: 2.3609 iter time (s): 63.816 samples/sec: 16.046 %comms: 0.0028258258575260386 %optimizer_step 0.055219290589115386 %forward: 22.793119324562575 %backward: 61.153233404359554 [2025-04-08 06:55:04,937] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29716.47 | forward: 145456.37 | backward_microstep: 390265.76 | backward: 390254.95 | backward_inner_microstep: 390237.18 | backward_inner: 390230.43 | backward_allreduce_microstep: 8.56 | backward_allreduce: 2.90 | reduce_tied_grads: 0.31 | comms: 18.03 | reduce_grads: 0.21 | step: 352.39 | _step_clipping: 0.13 | _step_step: 350.54 | _step_zero_grad: 0.51 | _step_check_overflow: 0.57 samples/sec: 16.046 | iteration 19670/ 143000 | elapsed time per iteration (ms): 63816.5 | learning rate: 5.763E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.357582E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 07:05:36,652] [INFO] [logging.py:60:log_dist] [Rank 0] step=19680, skipped=20, lr=[0.0005762600684191342, 0.0005762600684191342], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19680 loss: 2.3726 iter time (s): 63.171 samples/sec: 16.210 %comms: 0.002854904553656286 %optimizer_step 0.05688715203646159 %forward: 23.04589595859141 %backward: 61.76605623808028 [2025-04-08 07:05:36,652] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23186.92 | forward: 145583.06 | backward_microstep: 390192.50 | backward: 390181.90 | backward_inner_microstep: 390164.62 | backward_inner: 390156.31 | backward_allreduce_microstep: 8.28 | backward_allreduce: 2.86 | reduce_tied_grads: 0.31 | comms: 18.03 | reduce_grads: 0.19 | step: 359.36 | _step_clipping: 0.11 | _step_step: 357.54 | _step_zero_grad: 0.52 | _step_check_overflow: 0.59 samples/sec: 16.210 | iteration 19680/ 143000 | elapsed time per iteration (ms): 63171.5 | learning rate: 5.763E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.363103E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 07:16:14,645] [INFO] [logging.py:60:log_dist] [Rank 0] step=19690, skipped=20, lr=[0.0005762343659119415, 0.0005762343659119415], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19690 loss: 2.3863 iter time (s): 63.799 samples/sec: 16.050 %comms: 0.002831482281501289 %optimizer_step 0.05507981940815874 %forward: 22.82255861291552 %backward: 61.1818263235086 [2025-04-08 07:16:14,645] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29308.06 | forward: 145605.04 | backward_microstep: 390344.71 | backward: 390332.32 | backward_inner_microstep: 390315.45 | backward_inner: 390308.82 | backward_allreduce_microstep: 7.96 | backward_allreduce: 2.76 | reduce_tied_grads: 0.30 | comms: 18.06 | reduce_grads: 0.19 | step: 351.40 | _step_clipping: 0.11 | _step_step: 349.74 | _step_zero_grad: 0.48 | _step_check_overflow: 0.51 samples/sec: 16.050 | iteration 19690/ 143000 | elapsed time per iteration (ms): 63799.3 | learning rate: 5.762E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.392010E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 07:26:55,355] [INFO] [logging.py:60:log_dist] [Rank 0] step=19700, skipped=20, lr=[0.0005762086500724397, 0.0005762086500724397], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19700 loss: 2.3831 iter time (s): 64.070 samples/sec: 15.982 %comms: 0.0028582174337308955 %optimizer_step 0.057671846094988204 %forward: 22.75483472415089 %backward: 60.93846457954166 [2025-04-08 07:26:55,355] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31645.89 | forward: 145790.97 | backward_microstep: 390449.79 | backward: 390434.74 | backward_inner_microstep: 390416.85 | backward_inner: 390409.71 | backward_allreduce_microstep: 8.45 | backward_allreduce: 2.89 | reduce_tied_grads: 0.38 | comms: 18.31 | reduce_grads: 0.22 | step: 369.51 | _step_clipping: 0.13 | _step_step: 367.53 | _step_zero_grad: 0.56 | _step_check_overflow: 0.65 samples/sec: 15.982 | iteration 19700/ 143000 | elapsed time per iteration (ms): 64071.0 | learning rate: 5.762E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.368626E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 07:37:39,746] [INFO] [logging.py:60:log_dist] [Rank 0] step=19710, skipped=20, lr=[0.0005761829209018699, 0.0005761829209018699], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19710 loss: 2.3610 iter time (s): 64.439 samples/sec: 15.891 %comms: 0.002812285556011476 %optimizer_step 0.0558034575811617 %forward: 22.658577250910547 %backward: 60.59530819839591 [2025-04-08 07:37:39,747] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 35073.31 | forward: 146008.57 | backward_microstep: 390482.85 | backward: 390467.32 | backward_inner_microstep: 390449.31 | backward_inner: 390442.26 | backward_allreduce_microstep: 8.49 | backward_allreduce: 2.92 | reduce_tied_grads: 0.33 | comms: 18.12 | reduce_grads: 0.24 | step: 359.59 | _step_clipping: 0.14 | _step_step: 357.49 | _step_zero_grad: 0.57 | _step_check_overflow: 0.73 samples/sec: 15.891 | iteration 19710/ 143000 | elapsed time per iteration (ms): 64439.1 | learning rate: 5.762E-04 | approx flops per GPU: 68.6TFLOPS | lm_loss: 2.360707E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 07:48:20,035] [INFO] [logging.py:60:log_dist] [Rank 0] step=19720, skipped=20, lr=[0.0005761571784014739, 0.0005761571784014739], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19720 loss: 2.3795 iter time (s): 64.028 samples/sec: 15.993 %comms: 0.002831832091387786 %optimizer_step 0.05653945469089153 %forward: 22.773164528843886 %backward: 60.980415920216544 [2025-04-08 07:48:20,036] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31198.66 | forward: 145812.65 | backward_microstep: 390462.04 | backward: 390447.10 | backward_inner_microstep: 390427.94 | backward_inner: 390421.16 | backward_allreduce_microstep: 8.23 | backward_allreduce: 2.84 | reduce_tied_grads: 0.33 | comms: 18.13 | reduce_grads: 0.21 | step: 362.01 | _step_clipping: 0.14 | _step_step: 360.11 | _step_zero_grad: 0.55 | _step_check_overflow: 0.59 samples/sec: 15.993 | iteration 19720/ 143000 | elapsed time per iteration (ms): 64028.9 | learning rate: 5.762E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.364311E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 07:58:54,640] [INFO] [logging.py:60:log_dist] [Rank 0] step=19730, skipped=20, lr=[0.0005761314225724942, 0.0005761314225724942], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19730 loss: 2.3618 iter time (s): 63.460 samples/sec: 16.136 %comms: 0.002888112704461655 %optimizer_step 0.057341470949639615 %forward: 22.957725256205705 %backward: 61.526180913172645 [2025-04-08 07:58:54,641] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25641.63 | forward: 145689.63 | backward_microstep: 390460.17 | backward: 390444.89 | backward_inner_microstep: 390426.64 | backward_inner: 390419.62 | backward_allreduce_microstep: 8.56 | backward_allreduce: 2.95 | reduce_tied_grads: 0.32 | comms: 18.33 | reduce_grads: 0.21 | step: 363.89 | _step_clipping: 0.13 | _step_step: 361.88 | _step_zero_grad: 0.54 | _step_check_overflow: 0.71 samples/sec: 16.136 | iteration 19730/ 143000 | elapsed time per iteration (ms): 63460.6 | learning rate: 5.761E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.367071E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 08:09:32,016] [INFO] [logging.py:60:log_dist] [Rank 0] step=19740, skipped=20, lr=[0.0005761056534161737, 0.0005761056534161737], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19740 loss: 2.3783 iter time (s): 63.737 samples/sec: 16.066 %comms: 0.00287017684468063 %optimizer_step 0.05864023900965721 %forward: 22.86183680182092 %backward: 61.24661451112432 [2025-04-08 08:09:32,017] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28438.53 | forward: 145714.27 | backward_microstep: 390380.07 | backward: 390366.97 | backward_inner_microstep: 390348.64 | backward_inner: 390341.84 | backward_allreduce_microstep: 8.80 | backward_allreduce: 3.04 | reduce_tied_grads: 0.37 | comms: 18.29 | reduce_grads: 0.25 | step: 373.75 | _step_clipping: 0.16 | _step_step: 371.65 | _step_zero_grad: 0.57 | _step_check_overflow: 0.69 samples/sec: 16.066 | iteration 19740/ 143000 | elapsed time per iteration (ms): 63737.5 | learning rate: 5.761E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.365422E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 08:20:09,644] [INFO] [logging.py:60:log_dist] [Rank 0] step=19750, skipped=20, lr=[0.0005760798709337564, 0.0005760798709337564], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19750 loss: 2.3775 iter time (s): 63.762 samples/sec: 16.060 %comms: 0.0029004843602852687 %optimizer_step 0.0583683122604881 %forward: 22.848233209549882 %backward: 61.25762581114805 [2025-04-08 08:20:09,645] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28491.33 | forward: 145685.38 | backward_microstep: 390608.43 | backward: 390592.16 | backward_inner_microstep: 390573.92 | backward_inner: 390565.06 | backward_allreduce_microstep: 8.65 | backward_allreduce: 3.08 | reduce_tied_grads: 0.37 | comms: 18.49 | reduce_grads: 0.22 | step: 372.17 | _step_clipping: 0.15 | _step_step: 370.05 | _step_zero_grad: 0.57 | _step_check_overflow: 0.74 samples/sec: 16.060 | iteration 19750/ 143000 | elapsed time per iteration (ms): 63762.9 | learning rate: 5.761E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.374897E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 08:30:57,968] [INFO] [logging.py:60:log_dist] [Rank 0] step=19760, skipped=20, lr=[0.0005760540751264865, 0.0005760540751264865], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19760 loss: 2.3527 iter time (s): 64.832 samples/sec: 15.795 %comms: 0.0027915538262884357 %optimizer_step 0.05568975018298424 %forward: 22.482845490960656 %backward: 60.203918923146105 [2025-04-08 08:30:57,970] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 39426.64 | forward: 145760.09 | backward_microstep: 390327.35 | backward: 390312.20 | backward_inner_microstep: 390293.92 | backward_inner: 390286.97 | backward_allreduce_microstep: 8.52 | backward_allreduce: 2.96 | reduce_tied_grads: 0.33 | comms: 18.10 | reduce_grads: 0.21 | step: 361.05 | _step_clipping: 0.13 | _step_step: 359.13 | _step_zero_grad: 0.52 | _step_check_overflow: 0.66 samples/sec: 15.795 | iteration 19760/ 143000 | elapsed time per iteration (ms): 64832.5 | learning rate: 5.761E-04 | approx flops per GPU: 68.1TFLOPS | lm_loss: 2.362277E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 08:41:32,875] [INFO] [logging.py:60:log_dist] [Rank 0] step=19770, skipped=20, lr=[0.0005760282659956091, 0.0005760282659956091], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19770 loss: 2.3430 iter time (s): 63.490 samples/sec: 16.129 %comms: 0.00293053685726224 %optimizer_step 0.057119764229385556 %forward: 22.928043045088053 %backward: 61.476330348138106 [2025-04-08 08:41:32,875] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26213.71 | forward: 145569.90 | backward_microstep: 390325.48 | backward: 390312.57 | backward_inner_microstep: 390295.39 | backward_inner: 390288.71 | backward_allreduce_microstep: 8.10 | backward_allreduce: 2.79 | reduce_tied_grads: 0.32 | comms: 18.61 | reduce_grads: 0.20 | step: 362.65 | _step_clipping: 0.14 | _step_step: 360.67 | _step_zero_grad: 0.57 | _step_check_overflow: 0.59 samples/sec: 16.128 | iteration 19770/ 143000 | elapsed time per iteration (ms): 63490.5 | learning rate: 5.760E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.367731E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 08:52:14,945] [INFO] [logging.py:60:log_dist] [Rank 0] step=19780, skipped=20, lr=[0.0005760024435423698, 0.0005760024435423698], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19780 loss: 2.3549 iter time (s): 64.206 samples/sec: 15.949 %comms: 0.0028238609990061734 %optimizer_step 0.05754342273453145 %forward: 22.73010414992157 %backward: 60.78677438033867 [2025-04-08 08:52:14,946] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 33022.34 | forward: 145942.00 | backward_microstep: 390306.67 | backward: 390290.49 | backward_inner_microstep: 390272.22 | backward_inner: 390265.27 | backward_allreduce_microstep: 8.64 | backward_allreduce: 2.94 | reduce_tied_grads: 0.34 | comms: 18.13 | reduce_grads: 0.21 | step: 369.47 | _step_clipping: 0.12 | _step_step: 367.73 | _step_zero_grad: 0.51 | _step_check_overflow: 0.48 samples/sec: 15.948 | iteration 19780/ 143000 | elapsed time per iteration (ms): 64207.1 | learning rate: 5.760E-04 | approx flops per GPU: 68.8TFLOPS | lm_loss: 2.363662E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 09:02:53,515] [INFO] [logging.py:60:log_dist] [Rank 0] step=19790, skipped=20, lr=[0.0005759766077680149, 0.0005759766077680149], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19790 loss: 2.3896 iter time (s): 63.856 samples/sec: 16.036 %comms: 0.003459098430017951 %optimizer_step 0.06050536123517313 %forward: 22.854798922480573 %backward: 61.157795415724046 [2025-04-08 09:02:53,515] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29217.81 | forward: 145942.32 | backward_microstep: 390546.46 | backward: 390531.14 | backward_inner_microstep: 390512.43 | backward_inner: 390505.25 | backward_allreduce_microstep: 8.62 | backward_allreduce: 2.95 | reduce_tied_grads: 0.37 | comms: 22.09 | reduce_grads: 0.21 | step: 386.36 | _step_clipping: 0.15 | _step_step: 384.48 | _step_zero_grad: 0.59 | _step_check_overflow: 0.51 samples/sec: 16.036 | iteration 19790/ 143000 | elapsed time per iteration (ms): 63856.9 | learning rate: 5.760E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.368123E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 09:13:33,111] [INFO] [logging.py:60:log_dist] [Rank 0] step=19800, skipped=20, lr=[0.0005759507586737914, 0.0005759507586737914], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19800 loss: 2.3710 iter time (s): 63.959 samples/sec: 16.010 %comms: 0.0029226491360562074 %optimizer_step 0.05896833428872245 %forward: 22.81350200560143 %backward: 61.09890670377312 [2025-04-08 09:13:33,112] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30004.53 | forward: 145912.87 | backward_microstep: 390801.21 | backward: 390782.47 | backward_inner_microstep: 390764.27 | backward_inner: 390757.17 | backward_allreduce_microstep: 8.53 | backward_allreduce: 2.97 | reduce_tied_grads: 0.36 | comms: 18.69 | reduce_grads: 0.23 | step: 377.16 | _step_clipping: 0.14 | _step_step: 374.95 | _step_zero_grad: 0.59 | _step_check_overflow: 0.55 samples/sec: 16.010 | iteration 19800/ 143000 | elapsed time per iteration (ms): 63959.7 | learning rate: 5.760E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.377593E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 09:24:11,485] [INFO] [logging.py:60:log_dist] [Rank 0] step=19810, skipped=20, lr=[0.000575924896260947, 0.000575924896260947], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19810 loss: 2.3850 iter time (s): 63.837 samples/sec: 16.041 %comms: 0.0028476870235166003 %optimizer_step 0.05766832328435674 %forward: 22.89719690480402 %backward: 61.27590576939363 [2025-04-08 09:24:11,486] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28085.44 | forward: 146168.21 | backward_microstep: 391182.91 | backward: 391165.32 | backward_inner_microstep: 391146.54 | backward_inner: 391139.28 | backward_allreduce_microstep: 8.86 | backward_allreduce: 3.06 | reduce_tied_grads: 0.35 | comms: 18.18 | reduce_grads: 0.22 | step: 368.14 | _step_clipping: 0.13 | _step_step: 366.15 | _step_zero_grad: 0.61 | _step_check_overflow: 0.60 samples/sec: 16.041 | iteration 19810/ 143000 | elapsed time per iteration (ms): 63837.4 | learning rate: 5.759E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.373365E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 09:34:43,743] [INFO] [logging.py:60:log_dist] [Rank 0] step=19820, skipped=20, lr=[0.0005758990205307297, 0.0005758990205307297], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19820 loss: 2.3719 iter time (s): 63.225 samples/sec: 16.196 %comms: 0.0028862436660165866 %optimizer_step 0.05768260102582038 %forward: 23.142219604792473 %backward: 61.87117655510066 [2025-04-08 09:34:43,744] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21723.04 | forward: 146317.04 | backward_microstep: 391238.38 | backward: 391181.46 | backward_inner_microstep: 391162.67 | backward_inner: 391155.25 | backward_allreduce_microstep: 8.68 | backward_allreduce: 3.02 | reduce_tied_grads: 0.35 | comms: 18.25 | reduce_grads: 0.20 | step: 364.70 | _step_clipping: 0.13 | _step_step: 362.64 | _step_zero_grad: 0.55 | _step_check_overflow: 0.74 samples/sec: 16.196 | iteration 19820/ 143000 | elapsed time per iteration (ms): 63225.8 | learning rate: 5.759E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.373242E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 09:45:16,953] [INFO] [logging.py:60:log_dist] [Rank 0] step=19830, skipped=20, lr=[0.0005758731314843886, 0.0005758731314843886], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19830 loss: 2.3506 iter time (s): 63.320 samples/sec: 16.172 %comms: 0.002965304005153103 %optimizer_step 0.05964137776623966 %forward: 23.027559846209623 %backward: 61.69378340989176 [2025-04-08 09:45:16,954] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23832.63 | forward: 145811.38 | backward_microstep: 390662.90 | backward: 390647.36 | backward_inner_microstep: 390629.12 | backward_inner: 390621.86 | backward_allreduce_microstep: 8.64 | backward_allreduce: 2.98 | reduce_tied_grads: 0.37 | comms: 18.78 | reduce_grads: 0.22 | step: 377.65 | _step_clipping: 0.13 | _step_step: 375.72 | _step_zero_grad: 0.56 | _step_check_overflow: 0.58 samples/sec: 16.172 | iteration 19830/ 143000 | elapsed time per iteration (ms): 63321.0 | learning rate: 5.759E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.362850E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 09:55:49,602] [INFO] [logging.py:60:log_dist] [Rank 0] step=19840, skipped=20, lr=[0.000575847229123173, 0.000575847229123173], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19840 loss: 2.3571 iter time (s): 63.264 samples/sec: 16.186 %comms: 0.0028575895833145336 %optimizer_step 0.05604306529880308 %forward: 23.01371468701573 %backward: 61.69347249768726 [2025-04-08 09:55:49,602] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23918.63 | forward: 145594.55 | backward_microstep: 390311.81 | backward: 390299.15 | backward_inner_microstep: 390281.05 | backward_inner: 390274.35 | backward_allreduce_microstep: 8.56 | backward_allreduce: 2.96 | reduce_tied_grads: 0.33 | comms: 18.08 | reduce_grads: 0.20 | step: 354.55 | _step_clipping: 0.14 | _step_step: 352.64 | _step_zero_grad: 0.67 | _step_check_overflow: 0.52 samples/sec: 16.186 | iteration 19840/ 143000 | elapsed time per iteration (ms): 63264.8 | learning rate: 5.758E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.356220E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 10:06:33,489] [INFO] [logging.py:60:log_dist] [Rank 0] step=19850, skipped=20, lr=[0.0005758213134483333, 0.0005758213134483333], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19850 loss: 2.3926 iter time (s): 64.388 samples/sec: 15.904 %comms: 0.0028853594627311456 %optimizer_step 0.05755578036580821 %forward: 22.633344488318727 %backward: 60.657059383009795 [2025-04-08 10:06:33,490] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 34725.33 | forward: 145731.88 | backward_microstep: 390575.67 | backward: 390559.48 | backward_inner_microstep: 390537.62 | backward_inner: 390528.49 | backward_allreduce_microstep: 8.59 | backward_allreduce: 2.93 | reduce_tied_grads: 0.37 | comms: 18.58 | reduce_grads: 0.22 | step: 370.59 | _step_clipping: 0.14 | _step_step: 368.61 | _step_zero_grad: 0.55 | _step_check_overflow: 0.61 samples/sec: 15.903 | iteration 19850/ 143000 | elapsed time per iteration (ms): 64388.8 | learning rate: 5.758E-04 | approx flops per GPU: 68.6TFLOPS | lm_loss: 2.357430E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 10:17:15,458] [INFO] [logging.py:60:log_dist] [Rank 0] step=19860, skipped=20, lr=[0.0005757953844611201, 0.0005757953844611201], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19860 loss: 2.3693 iter time (s): 64.196 samples/sec: 15.951 %comms: 0.002854278645413786 %optimizer_step 0.056012479416972905 %forward: 22.65894743408823 %backward: 60.80848530431871 [2025-04-08 10:17:15,459] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 33315.44 | forward: 145462.13 | backward_microstep: 390382.62 | backward: 390368.16 | backward_inner_microstep: 390350.10 | backward_inner: 390343.06 | backward_allreduce_microstep: 8.58 | backward_allreduce: 3.02 | reduce_tied_grads: 0.33 | comms: 18.32 | reduce_grads: 0.22 | step: 359.58 | _step_clipping: 0.14 | _step_step: 357.73 | _step_zero_grad: 0.55 | _step_check_overflow: 0.54 samples/sec: 15.951 | iteration 19860/ 143000 | elapsed time per iteration (ms): 64196.9 | learning rate: 5.758E-04 | approx flops per GPU: 68.8TFLOPS | lm_loss: 2.366368E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 10:27:50,108] [INFO] [logging.py:60:log_dist] [Rank 0] step=19870, skipped=20, lr=[0.000575769442162785, 0.000575769442162785], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19870 loss: 2.3640 iter time (s): 63.464 samples/sec: 16.135 %comms: 0.002860525094469373 %optimizer_step 0.05657475033741087 %forward: 22.935191952250452 %backward: 61.49925597514635 [2025-04-08 10:27:50,109] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25987.98 | forward: 145556.77 | backward_microstep: 390314.59 | backward: 390301.20 | backward_inner_microstep: 390281.75 | backward_inner: 390275.11 | backward_allreduce_microstep: 10.18 | backward_allreduce: 2.96 | reduce_tied_grads: 0.33 | comms: 18.15 | reduce_grads: 0.23 | step: 359.05 | _step_clipping: 0.13 | _step_step: 357.06 | _step_zero_grad: 0.51 | _step_check_overflow: 0.71 samples/sec: 16.135 | iteration 19870/ 143000 | elapsed time per iteration (ms): 63465.0 | learning rate: 5.758E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.368537E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 10:38:21,933] [INFO] [logging.py:60:log_dist] [Rank 0] step=19880, skipped=20, lr=[0.00057574348655458, 0.00057574348655458], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19880 loss: 2.3689 iter time (s): 63.182 samples/sec: 16.207 %comms: 0.0028491646301492296 %optimizer_step 0.05579500861533626 %forward: 23.033694892075847 %backward: 61.750312193504975 [2025-04-08 10:38:21,933] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23366.53 | forward: 145531.20 | backward_microstep: 390161.13 | backward: 390150.05 | backward_inner_microstep: 390132.59 | backward_inner: 390126.16 | backward_allreduce_microstep: 8.49 | backward_allreduce: 2.93 | reduce_tied_grads: 0.31 | comms: 18.00 | reduce_grads: 0.22 | step: 352.52 | _step_clipping: 0.13 | _step_step: 350.78 | _step_zero_grad: 0.52 | _step_check_overflow: 0.51 samples/sec: 16.207 | iteration 19880/ 143000 | elapsed time per iteration (ms): 63182.5 | learning rate: 5.757E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.367749E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 10:48:57,244] [INFO] [logging.py:60:log_dist] [Rank 0] step=19890, skipped=20, lr=[0.0005757175176377579, 0.0005757175176377579], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19890 loss: 2.3788 iter time (s): 63.530 samples/sec: 16.118 %comms: 0.0029264075680392873 %optimizer_step 0.05699364317835534 %forward: 22.91708563630028 %backward: 61.432019570958765 [2025-04-08 10:48:57,244] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26611.64 | forward: 145591.75 | backward_microstep: 390288.03 | backward: 390276.29 | backward_inner_microstep: 390258.57 | backward_inner: 390251.83 | backward_allreduce_microstep: 8.47 | backward_allreduce: 2.91 | reduce_tied_grads: 0.37 | comms: 18.59 | reduce_grads: 0.22 | step: 362.08 | _step_clipping: 0.14 | _step_step: 360.16 | _step_zero_grad: 0.54 | _step_check_overflow: 0.61 samples/sec: 16.118 | iteration 19890/ 143000 | elapsed time per iteration (ms): 63531.1 | learning rate: 5.757E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.362351E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 10:59:31,084] [INFO] [logging.py:60:log_dist] [Rank 0] step=19900, skipped=20, lr=[0.000575691535413572, 0.000575691535413572], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19900 loss: 2.3866 iter time (s): 63.383 samples/sec: 16.156 %comms: 0.0028804652102415113 %optimizer_step 0.055900893468985934 %forward: 22.959770510778483 %backward: 61.57777762689035 [2025-04-08 10:59:31,085] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25156.67 | forward: 145526.92 | backward_microstep: 390314.24 | backward: 390301.14 | backward_inner_microstep: 390282.86 | backward_inner: 390275.92 | backward_allreduce_microstep: 8.83 | backward_allreduce: 3.11 | reduce_tied_grads: 0.33 | comms: 18.26 | reduce_grads: 0.22 | step: 354.32 | _step_clipping: 0.13 | _step_step: 352.40 | _step_zero_grad: 0.51 | _step_check_overflow: 0.64 samples/sec: 16.155 | iteration 19900/ 143000 | elapsed time per iteration (ms): 63384.0 | learning rate: 5.757E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.376620E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 11:10:06,572] [INFO] [logging.py:60:log_dist] [Rank 0] step=19910, skipped=20, lr=[0.0005756655398832764, 0.0005756655398832764], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19910 loss: 2.3701 iter time (s): 63.548 samples/sec: 16.114 %comms: 0.002929759915456816 %optimizer_step 0.05652632033784741 %forward: 22.909635312433704 %backward: 61.432529345395814 [2025-04-08 11:10:06,573] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26638.65 | forward: 145586.69 | backward_microstep: 390410.38 | backward: 390392.87 | backward_inner_microstep: 390372.95 | backward_inner: 390365.83 | backward_allreduce_microstep: 10.32 | backward_allreduce: 2.97 | reduce_tied_grads: 0.38 | comms: 18.62 | reduce_grads: 0.21 | step: 359.21 | _step_clipping: 0.14 | _step_step: 357.20 | _step_zero_grad: 0.52 | _step_check_overflow: 0.69 samples/sec: 16.114 | iteration 19910/ 143000 | elapsed time per iteration (ms): 63548.9 | learning rate: 5.757E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.365884E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 11:20:42,931] [INFO] [logging.py:60:log_dist] [Rank 0] step=19920, skipped=20, lr=[0.0005756395310481257, 0.0005756395310481257], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19920 loss: 2.3819 iter time (s): 63.635 samples/sec: 16.092 %comms: 0.0036405401020291484 %optimizer_step 0.05620480926887145 %forward: 22.888999356804447 %backward: 61.36642508354071 [2025-04-08 11:20:42,932] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27291.40 | forward: 145654.65 | backward_microstep: 390527.50 | backward: 390506.59 | backward_inner_microstep: 390488.78 | backward_inner: 390481.83 | backward_allreduce_microstep: 8.41 | backward_allreduce: 2.89 | reduce_tied_grads: 0.32 | comms: 23.17 | reduce_grads: 0.21 | step: 357.66 | _step_clipping: 0.12 | _step_step: 355.85 | _step_zero_grad: 0.51 | _step_check_overflow: 0.56 samples/sec: 16.092 | iteration 19920/ 143000 | elapsed time per iteration (ms): 63635.8 | learning rate: 5.756E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.362868E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 11:31:23,064] [INFO] [logging.py:60:log_dist] [Rank 0] step=19930, skipped=20, lr=[0.0005756135089093752, 0.0005756135089093752], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19930 loss: 2.3561 iter time (s): 64.013 samples/sec: 15.997 %comms: 0.002875466599793362 %optimizer_step 0.055321268885132405 %forward: 22.773195523623414 %backward: 61.04661347494071 [2025-04-08 11:31:23,064] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30693.36 | forward: 145777.31 | backward_microstep: 390794.58 | backward: 390775.67 | backward_inner_microstep: 390756.89 | backward_inner: 390749.68 | backward_allreduce_microstep: 9.01 | backward_allreduce: 2.99 | reduce_tied_grads: 0.35 | comms: 18.41 | reduce_grads: 0.20 | step: 354.13 | _step_clipping: 0.14 | _step_step: 352.24 | _step_zero_grad: 0.52 | _step_check_overflow: 0.59 samples/sec: 15.997 | iteration 19930/ 143000 | elapsed time per iteration (ms): 64013.3 | learning rate: 5.756E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.361426E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 11:41:55,509] [INFO] [logging.py:60:log_dist] [Rank 0] step=19940, skipped=20, lr=[0.000575587473468281, 0.000575587473468281], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19940 loss: 2.3775 iter time (s): 63.244 samples/sec: 16.191 %comms: 0.002854096055224194 %optimizer_step 0.05713873235222487 %forward: 23.02067648792073 %backward: 61.71451656775222 [2025-04-08 11:41:55,510] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23695.84 | forward: 145591.86 | backward_microstep: 390321.54 | backward: 390307.01 | backward_inner_microstep: 390287.32 | backward_inner: 390280.49 | backward_allreduce_microstep: 8.36 | backward_allreduce: 2.81 | reduce_tied_grads: 0.31 | comms: 18.05 | reduce_grads: 0.20 | step: 361.37 | _step_clipping: 0.13 | _step_step: 359.51 | _step_zero_grad: 0.51 | _step_check_overflow: 0.60 samples/sec: 16.191 | iteration 19940/ 143000 | elapsed time per iteration (ms): 63244.5 | learning rate: 5.756E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.356413E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 11:52:31,764] [INFO] [logging.py:60:log_dist] [Rank 0] step=19950, skipped=20, lr=[0.0005755614247260994, 0.0005755614247260994], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19950 loss: 2.3578 iter time (s): 63.625 samples/sec: 16.094 %comms: 0.0028846369407000837 %optimizer_step 0.05770285640264793 %forward: 22.895135040613564 %backward: 61.3633300885577 [2025-04-08 11:52:31,765] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27280.30 | forward: 145669.98 | backward_microstep: 390439.52 | backward: 390423.33 | backward_inner_microstep: 390403.56 | backward_inner: 390396.62 | backward_allreduce_microstep: 8.59 | backward_allreduce: 3.00 | reduce_tied_grads: 0.35 | comms: 18.35 | reduce_grads: 0.22 | step: 367.13 | _step_clipping: 0.13 | _step_step: 365.20 | _step_zero_grad: 0.56 | _step_check_overflow: 0.57 samples/sec: 16.094 | iteration 19950/ 143000 | elapsed time per iteration (ms): 63625.5 | learning rate: 5.756E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.364821E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 12:03:16,090] [INFO] [logging.py:60:log_dist] [Rank 0] step=19960, skipped=20, lr=[0.0005755353626840878, 0.0005755353626840878], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19960 loss: 2.3640 iter time (s): 64.432 samples/sec: 15.893 %comms: 0.002870740376661119 %optimizer_step 0.05625838549365841 %forward: 22.641167460045747 %backward: 60.64143261120483 [2025-04-08 12:03:16,091] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 34807.29 | forward: 145881.55 | backward_microstep: 390745.26 | backward: 390724.82 | backward_inner_microstep: 390705.59 | backward_inner: 390696.39 | backward_allreduce_microstep: 9.09 | backward_allreduce: 3.05 | reduce_tied_grads: 0.33 | comms: 18.50 | reduce_grads: 0.21 | step: 362.48 | _step_clipping: 0.15 | _step_step: 360.56 | _step_zero_grad: 0.55 | _step_check_overflow: 0.58 samples/sec: 15.893 | iteration 19960/ 143000 | elapsed time per iteration (ms): 64432.6 | learning rate: 5.755E-04 | approx flops per GPU: 68.6TFLOPS | lm_loss: 2.364233E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 12:13:53,545] [INFO] [logging.py:60:log_dist] [Rank 0] step=19970, skipped=20, lr=[0.0005755092873435041, 0.0005755092873435041], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19970 loss: 2.3819 iter time (s): 63.745 samples/sec: 16.064 %comms: 0.002840084290503122 %optimizer_step 0.055654229519753164 %forward: 22.835520227846946 %backward: 61.241716366742295 [2025-04-08 12:13:53,546] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28658.49 | forward: 145564.69 | backward_microstep: 390398.61 | backward: 390384.44 | backward_inner_microstep: 390366.23 | backward_inner: 390359.42 | backward_allreduce_microstep: 8.63 | backward_allreduce: 2.96 | reduce_tied_grads: 0.34 | comms: 18.10 | reduce_grads: 0.20 | step: 354.77 | _step_clipping: 0.13 | _step_step: 352.90 | _step_zero_grad: 0.52 | _step_check_overflow: 0.60 samples/sec: 16.064 | iteration 19970/ 143000 | elapsed time per iteration (ms): 63745.4 | learning rate: 5.755E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.362216E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 12:24:37,209] [INFO] [logging.py:60:log_dist] [Rank 0] step=19980, skipped=20, lr=[0.0005754831987056069, 0.0005754831987056069], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19980 loss: 2.3520 iter time (s): 64.366 samples/sec: 15.909 %comms: 0.0029247364900913866 %optimizer_step 0.06134379111050526 %forward: 22.716313638969726 %backward: 60.73596279353808 [2025-04-08 12:24:37,210] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 33513.83 | forward: 146215.31 | backward_microstep: 390953.11 | backward: 390931.72 | backward_inner_microstep: 390908.55 | backward_inner: 390900.97 | backward_allreduce_microstep: 9.04 | backward_allreduce: 3.10 | reduce_tied_grads: 0.36 | comms: 18.83 | reduce_grads: 0.24 | step: 394.84 | _step_clipping: 0.15 | _step_step: 392.72 | _step_zero_grad: 0.60 | _step_check_overflow: 0.67 samples/sec: 15.909 | iteration 19980/ 143000 | elapsed time per iteration (ms): 64366.4 | learning rate: 5.755E-04 | approx flops per GPU: 68.6TFLOPS | lm_loss: 2.362401E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 12:35:14,488] [INFO] [logging.py:60:log_dist] [Rank 0] step=19990, skipped=20, lr=[0.0005754570967716551, 0.0005754570967716551], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 19990 loss: 2.3538 iter time (s): 63.727 samples/sec: 16.068 %comms: 0.0028322990512724133 %optimizer_step 0.0552967060068347 %forward: 22.86215474936763 %backward: 61.30367659391132 [2025-04-08 12:35:14,489] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28010.60 | forward: 145694.35 | backward_microstep: 390689.61 | backward: 390671.81 | backward_inner_microstep: 390652.97 | backward_inner: 390645.51 | backward_allreduce_microstep: 8.90 | backward_allreduce: 3.04 | reduce_tied_grads: 0.31 | comms: 18.05 | reduce_grads: 0.22 | step: 352.39 | _step_clipping: 0.14 | _step_step: 350.66 | _step_zero_grad: 0.49 | _step_check_overflow: 0.50 samples/sec: 16.068 | iteration 19990/ 143000 | elapsed time per iteration (ms): 63727.9 | learning rate: 5.755E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.358720E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 12:45:49,563] [INFO] [logging.py:60:log_dist] [Rank 0] step=20000, skipped=20, lr=[0.0005754309815429086, 0.0005754309815429086], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20000 loss: 2.3666 iter time (s): 63.507 samples/sec: 16.124 %comms: 0.002827072818169686 %optimizer_step 0.05502843330063644 %forward: 22.940399401867918 %backward: 61.46851505400809 [2025-04-08 12:45:49,564] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26206.83 | forward: 145687.44 | backward_microstep: 390380.40 | backward: 390367.69 | backward_inner_microstep: 390350.31 | backward_inner: 390343.59 | backward_allreduce_microstep: 8.24 | backward_allreduce: 2.86 | reduce_tied_grads: 0.30 | comms: 17.95 | reduce_grads: 0.18 | step: 349.47 | _step_clipping: 0.13 | _step_step: 347.72 | _step_zero_grad: 0.49 | _step_check_overflow: 0.57 samples/sec: 16.124 | iteration 20000/ 143000 | elapsed time per iteration (ms): 63507.5 | learning rate: 5.754E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.355021E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 12:45:52,520] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: ../checkpoints/mydata_left-pythia160m/global_step20000/mp_rank_00_model_states.pt [2025-04-08 12:46:06,595] [INFO] [engine.py:1805:_copy_recovery_script] creating recovery script ../checkpoints/mydata_left-pythia160m/zero_to_fp32.py [2025-04-08 12:46:06,604] [INFO] [engine.py:1818:_save_zero_checkpoint] zero checkpoint saved ../checkpoints/mydata_left-pythia160m/global_step20000/zero_pp_rank_0_mp_rank_00_optim_states.pt [2025-04-08 12:56:52,340] [INFO] [logging.py:60:log_dist] [Rank 0] step=20010, skipped=20, lr=[0.0005754048530206279, 0.0005754048530206279], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20010 loss: 2.3807 iter time (s): 64.572 samples/sec: 15.858 %comms: 0.002832575793707595 %optimizer_step 0.05553227256070153 %forward: 22.62763661627588 %backward: 60.50002272751062 [2025-04-08 12:56:52,340] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 36018.64 | forward: 146111.27 | backward_microstep: 390679.47 | backward: 390660.99 | backward_inner_microstep: 390641.79 | backward_inner: 390634.40 | backward_allreduce_microstep: 8.90 | backward_allreduce: 3.06 | reduce_tied_grads: 0.35 | comms: 18.29 | reduce_grads: 0.24 | step: 358.58 | _step_clipping: 0.13 | _step_step: 356.44 | _step_zero_grad: 0.59 | _step_check_overflow: 0.74 samples/sec: 15.450 | iteration 20010/ 143000 | elapsed time per iteration (ms): 66277.6 | learning rate: 5.754E-04 | approx flops per GPU: 66.7TFLOPS | lm_loss: 2.380287E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 13:07:29,527] [INFO] [logging.py:60:log_dist] [Rank 0] step=20020, skipped=20, lr=[0.000575378711206074, 0.000575378711206074], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20020 loss: 2.3766 iter time (s): 63.718 samples/sec: 16.071 %comms: 0.0028600584905235115 %optimizer_step 0.055307535396209744 %forward: 22.895331614786762 %backward: 61.255005324993675 [2025-04-08 13:07:29,527] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28118.02 | forward: 145884.81 | backward_microstep: 390316.75 | backward: 390305.54 | backward_inner_microstep: 390287.96 | backward_inner: 390281.35 | backward_allreduce_microstep: 8.44 | backward_allreduce: 2.90 | reduce_tied_grads: 0.32 | comms: 18.22 | reduce_grads: 0.20 | step: 352.41 | _step_clipping: 0.15 | _step_step: 350.68 | _step_zero_grad: 0.50 | _step_check_overflow: 0.49 samples/sec: 16.071 | iteration 20020/ 143000 | elapsed time per iteration (ms): 63718.7 | learning rate: 5.754E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.369472E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 13:18:04,455] [INFO] [logging.py:60:log_dist] [Rank 0] step=20030, skipped=20, lr=[0.0005753525561005086, 0.0005753525561005086], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20030 loss: 2.3735 iter time (s): 63.492 samples/sec: 16.128 %comms: 0.002843124758311724 %optimizer_step 0.055768676571458585 %forward: 22.92984473476548 %backward: 61.467981503382994 [2025-04-08 13:18:04,455] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26149.78 | forward: 145586.63 | backward_microstep: 390285.10 | backward: 390273.73 | backward_inner_microstep: 390255.91 | backward_inner: 390249.12 | backward_allreduce_microstep: 8.57 | backward_allreduce: 2.94 | reduce_tied_grads: 0.36 | comms: 18.05 | reduce_grads: 0.22 | step: 354.09 | _step_clipping: 0.13 | _step_step: 352.38 | _step_zero_grad: 0.52 | _step_check_overflow: 0.46 samples/sec: 16.128 | iteration 20030/ 143000 | elapsed time per iteration (ms): 63492.8 | learning rate: 5.754E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.362882E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 13:28:41,351] [INFO] [logging.py:60:log_dist] [Rank 0] step=20040, skipped=20, lr=[0.0005753263877051943, 0.0005753263877051943], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20040 loss: 2.3775 iter time (s): 63.689 samples/sec: 16.078 %comms: 0.002878769499971828 %optimizer_step 0.056075005328770765 %forward: 22.903167603993367 %backward: 61.28661143504872 [2025-04-08 13:28:41,352] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27682.33 | forward: 145868.24 | backward_microstep: 390342.50 | backward: 390328.98 | backward_inner_microstep: 390311.26 | backward_inner: 390302.78 | backward_allreduce_microstep: 8.44 | backward_allreduce: 2.90 | reduce_tied_grads: 0.29 | comms: 18.33 | reduce_grads: 0.22 | step: 357.14 | _step_clipping: 0.12 | _step_step: 355.35 | _step_zero_grad: 0.53 | _step_check_overflow: 0.54 samples/sec: 16.078 | iteration 20040/ 143000 | elapsed time per iteration (ms): 63689.7 | learning rate: 5.753E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.361134E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 13:39:19,199] [INFO] [logging.py:60:log_dist] [Rank 0] step=20050, skipped=20, lr=[0.0005753002060213938, 0.0005753002060213938], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20050 loss: 2.3711 iter time (s): 63.784 samples/sec: 16.054 %comms: 0.0028659573777568377 %optimizer_step 0.056970850359958025 %forward: 22.862282091030504 %backward: 61.20562872264704 [2025-04-08 13:39:19,199] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28637.22 | forward: 145825.14 | backward_microstep: 390408.32 | backward: 390394.95 | backward_inner_microstep: 390377.40 | backward_inner: 390370.60 | backward_allreduce_microstep: 8.33 | backward_allreduce: 2.88 | reduce_tied_grads: 0.33 | comms: 18.28 | reduce_grads: 0.23 | step: 363.38 | _step_clipping: 0.12 | _step_step: 361.62 | _step_zero_grad: 0.51 | _step_check_overflow: 0.50 samples/sec: 16.054 | iteration 20050/ 143000 | elapsed time per iteration (ms): 63784.7 | learning rate: 5.753E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.380401E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 13:49:56,812] [INFO] [logging.py:60:log_dist] [Rank 0] step=20060, skipped=20, lr=[0.000575274011050371, 0.000575274011050371], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20060 loss: 2.3517 iter time (s): 63.761 samples/sec: 16.060 %comms: 0.002846706175580548 %optimizer_step 0.0551614852223745 %forward: 22.845125586193422 %backward: 61.245401906273436 [2025-04-08 13:49:56,812] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28385.23 | forward: 145662.19 | backward_microstep: 390524.62 | backward: 390505.15 | backward_inner_microstep: 390486.90 | backward_inner: 390479.62 | backward_allreduce_microstep: 8.58 | backward_allreduce: 2.93 | reduce_tied_grads: 0.30 | comms: 18.15 | reduce_grads: 0.20 | step: 351.71 | _step_clipping: 0.12 | _step_step: 349.95 | _step_zero_grad: 0.52 | _step_check_overflow: 0.50 samples/sec: 16.060 | iteration 20060/ 143000 | elapsed time per iteration (ms): 63761.3 | learning rate: 5.753E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.356853E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 14:00:40,089] [INFO] [logging.py:60:log_dist] [Rank 0] step=20070, skipped=20, lr=[0.00057524780279339, 0.00057524780279339], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20070 loss: 2.3588 iter time (s): 64.327 samples/sec: 15.919 %comms: 0.002815893926343592 %optimizer_step 0.05612077521788474 %forward: 22.658284857470758 %backward: 60.67585296227171 [2025-04-08 14:00:40,090] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 34167.34 | forward: 145754.36 | backward_microstep: 390324.69 | backward: 390310.65 | backward_inner_microstep: 390292.58 | backward_inner: 390285.59 | backward_allreduce_microstep: 8.47 | backward_allreduce: 2.91 | reduce_tied_grads: 0.31 | comms: 18.11 | reduce_grads: 0.21 | step: 361.01 | _step_clipping: 0.13 | _step_step: 359.05 | _step_zero_grad: 0.59 | _step_check_overflow: 0.63 samples/sec: 15.918 | iteration 20070/ 143000 | elapsed time per iteration (ms): 64327.8 | learning rate: 5.752E-04 | approx flops per GPU: 68.7TFLOPS | lm_loss: 2.363035E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 14:11:27,028] [INFO] [logging.py:60:log_dist] [Rank 0] step=20080, skipped=20, lr=[0.0005752215812517157, 0.0005752215812517157], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20080 loss: 2.3706 iter time (s): 64.693 samples/sec: 15.829 %comms: 0.002868030623808089 %optimizer_step 0.058178175522068575 %forward: 22.507967087204072 %backward: 60.367592258878354 [2025-04-08 14:11:27,029] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 37673.12 | forward: 145611.26 | backward_microstep: 390555.10 | backward: 390537.33 | backward_inner_microstep: 390518.92 | backward_inner: 390511.70 | backward_allreduce_microstep: 8.81 | backward_allreduce: 2.98 | reduce_tied_grads: 0.40 | comms: 18.55 | reduce_grads: 0.22 | step: 376.37 | _step_clipping: 0.14 | _step_step: 374.23 | _step_zero_grad: 0.58 | _step_check_overflow: 0.73 samples/sec: 15.828 | iteration 20080/ 143000 | elapsed time per iteration (ms): 64693.9 | learning rate: 5.752E-04 | approx flops per GPU: 68.3TFLOPS | lm_loss: 2.371680E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 14:19:51,814] [INFO] [stage1.py:697:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 524288.0, reducing to 524288.0 [2025-04-08 14:20:54,726] [INFO] [stage1.py:697:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 524288.0, reducing to 262144.0 [2025-04-08 14:21:57,510] [INFO] [logging.py:60:log_dist] [Rank 0] step=20090, skipped=22, lr=[0.0005752005944542475, 0.0005752005944542475], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20090 loss: 2.3474 iter time (s): 63.048 samples/sec: 16.242 %comms: 0.002316358534573255 %optimizer_step 0.04827978069134577 %forward: 23.08931635808608 %backward: 61.93386536741685 [2025-04-08 14:21:57,511] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21326.42 | forward: 145572.67 | backward_microstep: 390492.77 | backward: 390478.34 | backward_inner_microstep: 390460.40 | backward_inner: 390453.40 | backward_allreduce_microstep: 8.61 | backward_allreduce: 2.94 | reduce_tied_grads: 0.34 | comms: 14.60 | reduce_grads: 0.21 | step: 304.39 | _step_clipping: 0.12 | _step_step: 302.65 | _step_zero_grad: 0.53 | _step_check_overflow: 0.45 samples/sec: 16.242 | iteration 20090/ 143000 | elapsed time per iteration (ms): 63048.2 | learning rate: 5.752E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.355695E+00 | loss scale: 262144.0 | number of skipped iterations: 2 | number of nan iterations: 0 | time (ms) [2025-04-08 14:32:50,292] [INFO] [logging.py:60:log_dist] [Rank 0] step=20100, skipped=22, lr=[0.0005751743490033153, 0.0005751743490033153], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20100 loss: 2.3573 iter time (s): 65.278 samples/sec: 15.687 %comms: 0.0027947310783551934 %optimizer_step 0.05587011407143518 %forward: 22.328158398482273 %backward: 59.8580998834579 [2025-04-08 14:32:50,293] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 43136.37 | forward: 145752.69 | backward_microstep: 390759.82 | backward: 390738.86 | backward_inner_microstep: 390720.44 | backward_inner: 390712.86 | backward_allreduce_microstep: 8.49 | backward_allreduce: 2.93 | reduce_tied_grads: 0.32 | comms: 18.24 | reduce_grads: 0.21 | step: 364.71 | _step_clipping: 0.13 | _step_step: 362.81 | _step_zero_grad: 0.52 | _step_check_overflow: 0.61 samples/sec: 15.687 | iteration 20100/ 143000 | elapsed time per iteration (ms): 65278.2 | learning rate: 5.752E-04 | approx flops per GPU: 67.7TFLOPS | lm_loss: 2.360889E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 14:43:25,970] [INFO] [logging.py:60:log_dist] [Rank 0] step=20110, skipped=22, lr=[0.0005751480902712349, 0.0005751480902712349], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20110 loss: 2.3443 iter time (s): 63.567 samples/sec: 16.109 %comms: 0.0028627975810583154 %optimizer_step 0.05595792601214647 %forward: 22.936171396615336 %backward: 61.38482240761716 [2025-04-08 14:43:25,971] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26690.48 | forward: 145798.90 | backward_microstep: 390219.06 | backward: 390206.36 | backward_inner_microstep: 390188.66 | backward_inner: 390182.00 | backward_allreduce_microstep: 8.57 | backward_allreduce: 2.98 | reduce_tied_grads: 0.32 | comms: 18.20 | reduce_grads: 0.22 | step: 355.71 | _step_clipping: 0.13 | _step_step: 353.83 | _step_zero_grad: 0.50 | _step_check_overflow: 0.65 samples/sec: 16.109 | iteration 20110/ 143000 | elapsed time per iteration (ms): 63567.8 | learning rate: 5.751E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.357877E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 14:54:14,446] [INFO] [logging.py:60:log_dist] [Rank 0] step=20120, skipped=22, lr=[0.0005751218182592739, 0.0005751218182592739], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20120 loss: 2.3677 iter time (s): 64.847 samples/sec: 15.791 %comms: 0.0028209685492761776 %optimizer_step 0.056491580015340666 %forward: 22.47861689883071 %backward: 60.192813073499266 [2025-04-08 14:54:14,447] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 39374.09 | forward: 145767.15 | backward_microstep: 390344.97 | backward: 390332.50 | backward_inner_microstep: 390314.62 | backward_inner: 390307.77 | backward_allreduce_microstep: 8.54 | backward_allreduce: 2.93 | reduce_tied_grads: 0.36 | comms: 18.29 | reduce_grads: 0.22 | step: 366.33 | _step_clipping: 0.12 | _step_step: 364.63 | _step_zero_grad: 0.51 | _step_check_overflow: 0.46 samples/sec: 15.791 | iteration 20120/ 143000 | elapsed time per iteration (ms): 64847.6 | learning rate: 5.751E-04 | approx flops per GPU: 68.1TFLOPS | lm_loss: 2.368140E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 15:05:02,265] [INFO] [logging.py:60:log_dist] [Rank 0] step=20130, skipped=22, lr=[0.0005750955329687003, 0.0005750955329687003], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20130 loss: 2.3731 iter time (s): 64.781 samples/sec: 15.807 %comms: 0.0028429712588030646 %optimizer_step 0.055911400053331506 %forward: 22.50907101032299 %backward: 60.28708948476484 [2025-04-08 15:05:02,265] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 38430.41 | forward: 145816.55 | backward_microstep: 390564.74 | backward: 390547.23 | backward_inner_microstep: 390529.08 | backward_inner: 390521.97 | backward_allreduce_microstep: 8.51 | backward_allreduce: 2.94 | reduce_tied_grads: 0.34 | comms: 18.42 | reduce_grads: 0.21 | step: 362.20 | _step_clipping: 0.13 | _step_step: 360.02 | _step_zero_grad: 0.58 | _step_check_overflow: 0.77 samples/sec: 15.807 | iteration 20130/ 143000 | elapsed time per iteration (ms): 64781.8 | learning rate: 5.751E-04 | approx flops per GPU: 68.2TFLOPS | lm_loss: 2.358602E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 15:15:41,332] [INFO] [logging.py:60:log_dist] [Rank 0] step=20140, skipped=22, lr=[0.0005750692344007826, 0.0005750692344007826], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20140 loss: 2.3585 iter time (s): 63.906 samples/sec: 16.024 %comms: 0.0028681376416590464 %optimizer_step 0.05647315188017279 %forward: 22.848662084973164 %backward: 61.136278272556375 [2025-04-08 15:15:41,332] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29308.65 | forward: 146016.84 | backward_microstep: 390719.12 | backward: 390697.99 | backward_inner_microstep: 390677.30 | backward_inner: 390668.31 | backward_allreduce_microstep: 8.89 | backward_allreduce: 3.06 | reduce_tied_grads: 0.33 | comms: 18.33 | reduce_grads: 0.22 | step: 360.90 | _step_clipping: 0.13 | _step_step: 358.93 | _step_zero_grad: 0.60 | _step_check_overflow: 0.59 samples/sec: 16.023 | iteration 20140/ 143000 | elapsed time per iteration (ms): 63906.7 | learning rate: 5.751E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.365952E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 15:26:22,261] [INFO] [logging.py:60:log_dist] [Rank 0] step=20150, skipped=22, lr=[0.0005750429225567903, 0.0005750429225567903], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20150 loss: 2.3580 iter time (s): 64.092 samples/sec: 15.977 %comms: 0.0029481191153491195 %optimizer_step 0.05951092322046014 %forward: 22.786620144341157 %backward: 60.95386943038824 [2025-04-08 15:26:22,261] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31162.34 | forward: 146044.50 | backward_microstep: 390685.52 | backward: 390666.87 | backward_inner_microstep: 390648.31 | backward_inner: 390640.96 | backward_allreduce_microstep: 8.67 | backward_allreduce: 2.98 | reduce_tied_grads: 0.37 | comms: 18.90 | reduce_grads: 0.23 | step: 381.42 | _step_clipping: 0.15 | _step_step: 379.16 | _step_zero_grad: 0.65 | _step_check_overflow: 0.70 samples/sec: 15.977 | iteration 20150/ 143000 | elapsed time per iteration (ms): 64092.9 | learning rate: 5.750E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.373152E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 15:37:01,560] [INFO] [logging.py:60:log_dist] [Rank 0] step=20160, skipped=22, lr=[0.0005750165974379932, 0.0005750165974379932], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20160 loss: 2.3662 iter time (s): 63.929 samples/sec: 16.018 %comms: 0.002827487539455858 %optimizer_step 0.05726900430665892 %forward: 22.800228278669916 %backward: 61.08414052345002 [2025-04-08 15:37:01,561] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30040.20 | forward: 145760.37 | backward_microstep: 390521.72 | backward: 390506.92 | backward_inner_microstep: 390488.45 | backward_inner: 390481.31 | backward_allreduce_microstep: 8.61 | backward_allreduce: 2.99 | reduce_tied_grads: 0.33 | comms: 18.08 | reduce_grads: 0.20 | step: 366.12 | _step_clipping: 0.12 | _step_step: 360.58 | _step_zero_grad: 2.33 | _step_check_overflow: 0.72 samples/sec: 16.018 | iteration 20160/ 143000 | elapsed time per iteration (ms): 63929.9 | learning rate: 5.750E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.362067E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 15:47:41,831] [INFO] [logging.py:60:log_dist] [Rank 0] step=20170, skipped=22, lr=[0.0005749902590456619, 0.0005749902590456619], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20170 loss: 2.3474 iter time (s): 64.027 samples/sec: 15.993 %comms: 0.0028257648718149845 %optimizer_step 0.05862211861199381 %forward: 22.741548126748818 %backward: 60.97114095992572 [2025-04-08 15:47:41,832] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31298.28 | forward: 145606.26 | backward_microstep: 390390.02 | backward: 390377.10 | backward_inner_microstep: 390356.97 | backward_inner: 390346.58 | backward_allreduce_microstep: 10.46 | backward_allreduce: 4.73 | reduce_tied_grads: 0.33 | comms: 18.09 | reduce_grads: 0.23 | step: 375.34 | _step_clipping: 0.13 | _step_step: 373.52 | _step_zero_grad: 0.54 | _step_check_overflow: 0.53 samples/sec: 15.993 | iteration 20170/ 143000 | elapsed time per iteration (ms): 64027.1 | learning rate: 5.750E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.354303E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 15:58:23,720] [INFO] [logging.py:60:log_dist] [Rank 0] step=20180, skipped=22, lr=[0.0005749639073810677, 0.0005749639073810677], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20180 loss: 2.3432 iter time (s): 64.188 samples/sec: 15.953 %comms: 0.0029666243869668997 %optimizer_step 0.057319860281778726 %forward: 22.711770131972038 %backward: 60.857360680534136 [2025-04-08 15:58:23,721] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32442.20 | forward: 145782.96 | backward_microstep: 390650.38 | backward: 390632.97 | backward_inner_microstep: 390614.17 | backward_inner: 390606.88 | backward_allreduce_microstep: 8.83 | backward_allreduce: 3.01 | reduce_tied_grads: 0.56 | comms: 19.04 | reduce_grads: 0.23 | step: 367.93 | _step_clipping: 0.13 | _step_step: 365.52 | _step_zero_grad: 0.63 | _step_check_overflow: 0.88 samples/sec: 15.953 | iteration 20180/ 143000 | elapsed time per iteration (ms): 64188.9 | learning rate: 5.750E-04 | approx flops per GPU: 68.8TFLOPS | lm_loss: 2.358390E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 16:09:12,862] [INFO] [logging.py:60:log_dist] [Rank 0] step=20190, skipped=22, lr=[0.0005749375424454823, 0.0005749375424454823], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20190 loss: 2.3472 iter time (s): 64.913 samples/sec: 15.775 %comms: 0.002821902147145093 %optimizer_step 0.05462937522630292 %forward: 22.435646053427174 %backward: 60.16868668523484 [2025-04-08 16:09:12,862] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 39947.84 | forward: 145637.50 | backward_microstep: 390592.52 | backward: 390575.65 | backward_inner_microstep: 390555.73 | backward_inner: 390548.61 | backward_allreduce_microstep: 8.61 | backward_allreduce: 2.93 | reduce_tied_grads: 0.34 | comms: 18.32 | reduce_grads: 0.22 | step: 354.62 | _step_clipping: 0.15 | _step_step: 352.77 | _step_zero_grad: 0.51 | _step_check_overflow: 0.55 samples/sec: 15.775 | iteration 20190/ 143000 | elapsed time per iteration (ms): 64914.1 | learning rate: 5.749E-04 | approx flops per GPU: 68.1TFLOPS | lm_loss: 2.362170E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 16:20:10,229] [INFO] [logging.py:60:log_dist] [Rank 0] step=20200, skipped=22, lr=[0.0005749111642401781, 0.0005749111642401781], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20200 loss: 2.3650 iter time (s): 65.736 samples/sec: 15.577 %comms: 0.0027703751808769756 %optimizer_step 0.05438975778136192 %forward: 22.192476724344772 %backward: 59.40308871944845 [2025-04-08 16:20:10,229] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 48000.50 | forward: 145884.68 | backward_microstep: 390512.01 | backward: 390492.71 | backward_inner_microstep: 390474.71 | backward_inner: 390467.60 | backward_allreduce_microstep: 8.52 | backward_allreduce: 2.95 | reduce_tied_grads: 0.34 | comms: 18.21 | reduce_grads: 0.22 | step: 357.54 | _step_clipping: 0.13 | _step_step: 355.69 | _step_zero_grad: 0.53 | _step_check_overflow: 0.54 samples/sec: 15.577 | iteration 20200/ 143000 | elapsed time per iteration (ms): 65736.7 | learning rate: 5.749E-04 | approx flops per GPU: 67.2TFLOPS | lm_loss: 2.357922E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 16:30:53,469] [INFO] [logging.py:60:log_dist] [Rank 0] step=20210, skipped=22, lr=[0.0005748847727664285, 0.0005748847727664285], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20210 loss: 2.3626 iter time (s): 64.323 samples/sec: 15.920 %comms: 0.0028298109002754673 %optimizer_step 0.0558802957890367 %forward: 22.679243099085276 %backward: 60.72793269659537 [2025-04-08 16:30:53,470] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 33710.30 | forward: 145880.59 | backward_microstep: 390641.04 | backward: 390622.69 | backward_inner_microstep: 390603.62 | backward_inner: 390595.93 | backward_allreduce_microstep: 9.00 | backward_allreduce: 3.12 | reduce_tied_grads: 0.37 | comms: 18.20 | reduce_grads: 0.23 | step: 359.44 | _step_clipping: 0.14 | _step_step: 357.33 | _step_zero_grad: 0.59 | _step_check_overflow: 0.69 samples/sec: 15.919 | iteration 20210/ 143000 | elapsed time per iteration (ms): 64324.0 | learning rate: 5.749E-04 | approx flops per GPU: 68.7TFLOPS | lm_loss: 2.352311E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 16:41:27,948] [INFO] [logging.py:60:log_dist] [Rank 0] step=20220, skipped=22, lr=[0.0005748583680255071, 0.0005748583680255071], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20220 loss: 2.3396 iter time (s): 63.447 samples/sec: 16.139 %comms: 0.002856826312144656 %optimizer_step 0.056395122978985 %forward: 22.936720438916836 %backward: 61.5238504020724 [2025-04-08 16:41:27,948] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25626.37 | forward: 145527.14 | backward_microstep: 390365.18 | backward: 390351.81 | backward_inner_microstep: 390333.57 | backward_inner: 390326.69 | backward_allreduce_microstep: 8.66 | backward_allreduce: 2.99 | reduce_tied_grads: 0.34 | comms: 18.13 | reduce_grads: 0.21 | step: 357.81 | _step_clipping: 0.13 | _step_step: 356.06 | _step_zero_grad: 0.54 | _step_check_overflow: 0.47 samples/sec: 16.139 | iteration 20220/ 143000 | elapsed time per iteration (ms): 63447.9 | learning rate: 5.749E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.349995E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 16:52:04,628] [INFO] [logging.py:60:log_dist] [Rank 0] step=20230, skipped=22, lr=[0.0005748319500186884, 0.0005748319500186884], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20230 loss: 2.3608 iter time (s): 63.668 samples/sec: 16.084 %comms: 0.0028546184427471317 %optimizer_step 0.05680110265728177 %forward: 22.871816951535948 %backward: 61.35494765087945 [2025-04-08 16:52:04,629] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27433.43 | forward: 145619.19 | backward_microstep: 390653.46 | backward: 390631.75 | backward_inner_microstep: 390609.53 | backward_inner: 390600.41 | backward_allreduce_microstep: 10.61 | backward_allreduce: 2.96 | reduce_tied_grads: 0.32 | comms: 18.17 | reduce_grads: 0.22 | step: 361.64 | _step_clipping: 0.12 | _step_step: 359.76 | _step_zero_grad: 0.52 | _step_check_overflow: 0.59 samples/sec: 16.083 | iteration 20230/ 143000 | elapsed time per iteration (ms): 63668.1 | learning rate: 5.748E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.358891E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 17:02:49,370] [INFO] [logging.py:60:log_dist] [Rank 0] step=20240, skipped=22, lr=[0.0005748055187472474, 0.0005748055187472474], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20240 loss: 2.3599 iter time (s): 64.474 samples/sec: 15.882 %comms: 0.0028000340622059885 %optimizer_step 0.05563813638819368 %forward: 22.576343863912022 %backward: 60.503150401086714 [2025-04-08 17:02:49,370] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 36183.46 | forward: 145557.73 | backward_microstep: 390095.94 | backward: 390085.35 | backward_inner_microstep: 390066.50 | backward_inner: 390059.92 | backward_allreduce_microstep: 8.28 | backward_allreduce: 2.83 | reduce_tied_grads: 0.31 | comms: 18.05 | reduce_grads: 0.21 | step: 358.72 | _step_clipping: 0.13 | _step_step: 356.97 | _step_zero_grad: 0.52 | _step_check_overflow: 0.48 samples/sec: 15.882 | iteration 20240/ 143000 | elapsed time per iteration (ms): 64474.1 | learning rate: 5.748E-04 | approx flops per GPU: 68.5TFLOPS | lm_loss: 2.359558E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 17:13:32,830] [INFO] [logging.py:60:log_dist] [Rank 0] step=20250, skipped=22, lr=[0.0005747790742124598, 0.0005747790742124598], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20250 loss: 2.3694 iter time (s): 64.345 samples/sec: 15.914 %comms: 0.0029160280258738814 %optimizer_step 0.05704916355370668 %forward: 22.6384948926995 %backward: 60.68180864422725 [2025-04-08 17:13:32,830] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 34380.42 | forward: 145668.34 | backward_microstep: 390475.49 | backward: 390459.62 | backward_inner_microstep: 390439.44 | backward_inner: 390432.13 | backward_allreduce_microstep: 10.47 | backward_allreduce: 3.05 | reduce_tied_grads: 0.36 | comms: 18.76 | reduce_grads: 0.25 | step: 367.09 | _step_clipping: 0.16 | _step_step: 365.03 | _step_zero_grad: 0.58 | _step_check_overflow: 0.61 samples/sec: 15.914 | iteration 20250/ 143000 | elapsed time per iteration (ms): 64346.0 | learning rate: 5.748E-04 | approx flops per GPU: 68.7TFLOPS | lm_loss: 2.368330E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 17:24:11,474] [INFO] [logging.py:60:log_dist] [Rank 0] step=20260, skipped=22, lr=[0.0005747526164156019, 0.0005747526164156019], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20260 loss: 2.3435 iter time (s): 63.864 samples/sec: 16.034 %comms: 0.0029583631763159053 %optimizer_step 0.059750991841709504 %forward: 22.809373086835908 %backward: 61.15862079246583 [2025-04-08 17:24:11,475] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29384.78 | forward: 145669.40 | backward_microstep: 390599.91 | backward: 390582.41 | backward_inner_microstep: 390563.09 | backward_inner: 390555.28 | backward_allreduce_microstep: 9.14 | backward_allreduce: 3.14 | reduce_tied_grads: 0.36 | comms: 18.89 | reduce_grads: 0.25 | step: 381.59 | _step_clipping: 0.13 | _step_step: 379.37 | _step_zero_grad: 0.77 | _step_check_overflow: 0.63 samples/sec: 16.034 | iteration 20260/ 143000 | elapsed time per iteration (ms): 63864.5 | learning rate: 5.748E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.359422E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 17:34:57,206] [INFO] [logging.py:60:log_dist] [Rank 0] step=20270, skipped=22, lr=[0.0005747261453579507, 0.0005747261453579507], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20270 loss: 2.3552 iter time (s): 64.573 samples/sec: 15.858 %comms: 0.0028835805285245997 %optimizer_step 0.056874008755116584 %forward: 22.610342823228063 %backward: 60.49609102723726 [2025-04-08 17:34:57,207] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 36078.36 | forward: 146000.74 | backward_microstep: 390655.15 | backward: 390638.67 | backward_inner_microstep: 390619.60 | backward_inner: 390610.56 | backward_allreduce_microstep: 9.12 | backward_allreduce: 3.25 | reduce_tied_grads: 0.37 | comms: 18.62 | reduce_grads: 0.23 | step: 367.25 | _step_clipping: 0.14 | _step_step: 365.29 | _step_zero_grad: 0.59 | _step_check_overflow: 0.56 samples/sec: 15.858 | iteration 20270/ 143000 | elapsed time per iteration (ms): 64573.2 | learning rate: 5.747E-04 | approx flops per GPU: 68.4TFLOPS | lm_loss: 2.363967E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 17:45:40,419] [INFO] [logging.py:60:log_dist] [Rank 0] step=20280, skipped=22, lr=[0.0005746996610407839, 0.0005746996610407839], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20280 loss: 2.3489 iter time (s): 64.321 samples/sec: 15.920 %comms: 0.0029005442171339416 %optimizer_step 0.059905276916640375 %forward: 22.672996580753917 %backward: 60.76231509965635 [2025-04-08 17:45:40,420] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 33474.43 | forward: 145834.21 | backward_microstep: 390847.38 | backward: 390827.23 | backward_inner_microstep: 390808.86 | backward_inner: 390801.37 | backward_allreduce_microstep: 8.45 | backward_allreduce: 2.92 | reduce_tied_grads: 0.39 | comms: 18.66 | reduce_grads: 0.27 | step: 385.31 | _step_clipping: 0.18 | _step_step: 383.08 | _step_zero_grad: 0.57 | _step_check_overflow: 0.73 samples/sec: 15.920 | iteration 20280/ 143000 | elapsed time per iteration (ms): 64321.3 | learning rate: 5.747E-04 | approx flops per GPU: 68.7TFLOPS | lm_loss: 2.359235E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 17:56:19,021] [INFO] [logging.py:60:log_dist] [Rank 0] step=20290, skipped=22, lr=[0.0005746731634653796, 0.0005746731634653796], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20290 loss: 2.3580 iter time (s): 63.859 samples/sec: 16.035 %comms: 0.0028975220937189025 %optimizer_step 0.05681523775138179 %forward: 22.811686415033165 %backward: 61.179867546820155 [2025-04-08 17:56:19,021] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29211.46 | forward: 145674.26 | backward_microstep: 390713.13 | backward: 390691.50 | backward_inner_microstep: 390671.93 | backward_inner: 390664.65 | backward_allreduce_microstep: 9.96 | backward_allreduce: 2.87 | reduce_tied_grads: 0.34 | comms: 18.50 | reduce_grads: 0.20 | step: 362.82 | _step_clipping: 0.12 | _step_step: 360.82 | _step_zero_grad: 0.55 | _step_check_overflow: 0.64 samples/sec: 16.035 | iteration 20290/ 143000 | elapsed time per iteration (ms): 63860.1 | learning rate: 5.747E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.358217E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 18:06:57,443] [INFO] [logging.py:60:log_dist] [Rank 0] step=20300, skipped=22, lr=[0.0005746466526330168, 0.0005746466526330168], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20300 loss: 2.3782 iter time (s): 63.842 samples/sec: 16.040 %comms: 0.002889151030836394 %optimizer_step 0.05780893831401729 %forward: 22.836159565046007 %backward: 61.183380562230674 [2025-04-08 18:06:57,443] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28983.56 | forward: 145789.49 | backward_microstep: 390619.64 | backward: 390603.94 | backward_inner_microstep: 390582.21 | backward_inner: 390571.62 | backward_allreduce_microstep: 8.66 | backward_allreduce: 2.97 | reduce_tied_grads: 0.56 | comms: 18.44 | reduce_grads: 0.23 | step: 369.06 | _step_clipping: 0.14 | _step_step: 366.97 | _step_zero_grad: 0.64 | _step_check_overflow: 0.63 samples/sec: 16.040 | iteration 20300/ 143000 | elapsed time per iteration (ms): 63842.2 | learning rate: 5.746E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.362152E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 18:17:37,555] [INFO] [logging.py:60:log_dist] [Rank 0] step=20310, skipped=22, lr=[0.000574620128544975, 0.000574620128544975], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20310 loss: 2.3482 iter time (s): 64.011 samples/sec: 15.997 %comms: 0.0028702734775495186 %optimizer_step 0.057901782784511184 %forward: 22.767708718296177 %backward: 60.99583229668583 [2025-04-08 18:17:37,555] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30966.06 | forward: 145737.32 | backward_microstep: 390450.13 | backward: 390437.59 | backward_inner_microstep: 390419.02 | backward_inner: 390410.23 | backward_allreduce_microstep: 8.89 | backward_allreduce: 3.10 | reduce_tied_grads: 0.41 | comms: 18.37 | reduce_grads: 0.25 | step: 370.63 | _step_clipping: 0.15 | _step_step: 368.56 | _step_zero_grad: 0.60 | _step_check_overflow: 0.63 samples/sec: 15.997 | iteration 20310/ 143000 | elapsed time per iteration (ms): 64011.2 | learning rate: 5.746E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.362131E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 18:28:19,927] [INFO] [logging.py:60:log_dist] [Rank 0] step=20320, skipped=22, lr=[0.0005745935912025343, 0.0005745935912025343], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20320 loss: 2.3511 iter time (s): 64.237 samples/sec: 15.941 %comms: 0.002852269332166497 %optimizer_step 0.05837138307994483 %forward: 22.744804220792375 %backward: 60.837806786349766 [2025-04-08 18:28:19,928] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32407.73 | forward: 146104.75 | backward_microstep: 390819.13 | backward: 390801.01 | backward_inner_microstep: 390781.85 | backward_inner: 390774.07 | backward_allreduce_microstep: 8.98 | backward_allreduce: 3.10 | reduce_tied_grads: 0.37 | comms: 18.32 | reduce_grads: 0.23 | step: 374.96 | _step_clipping: 0.13 | _step_step: 372.89 | _step_zero_grad: 0.62 | _step_check_overflow: 0.64 samples/sec: 15.941 | iteration 20320/ 143000 | elapsed time per iteration (ms): 64237.2 | learning rate: 5.746E-04 | approx flops per GPU: 68.8TFLOPS | lm_loss: 2.362133E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 18:39:02,150] [INFO] [logging.py:60:log_dist] [Rank 0] step=20330, skipped=22, lr=[0.0005745670406069755, 0.0005745670406069755], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20330 loss: 2.3535 iter time (s): 64.222 samples/sec: 15.945 %comms: 0.0028521896509017246 %optimizer_step 0.058949905703981 %forward: 22.737202794282645 %backward: 60.82193262978583 [2025-04-08 18:39:02,150] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32540.81 | forward: 146021.99 | backward_microstep: 390625.77 | backward: 390608.27 | backward_inner_microstep: 390589.89 | backward_inner: 390582.76 | backward_allreduce_microstep: 8.61 | backward_allreduce: 2.97 | reduce_tied_grads: 0.31 | comms: 18.32 | reduce_grads: 0.20 | step: 378.59 | _step_clipping: 0.13 | _step_step: 376.76 | _step_zero_grad: 0.56 | _step_check_overflow: 0.52 samples/sec: 15.945 | iteration 20330/ 143000 | elapsed time per iteration (ms): 64222.2 | learning rate: 5.746E-04 | approx flops per GPU: 68.8TFLOPS | lm_loss: 2.350457E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 18:49:46,763] [INFO] [logging.py:60:log_dist] [Rank 0] step=20340, skipped=22, lr=[0.0005745404767595803, 0.0005745404767595803], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20340 loss: 2.3551 iter time (s): 64.461 samples/sec: 15.886 %comms: 0.002841091902639934 %optimizer_step 0.05893899041563634 %forward: 22.637940079967763 %backward: 60.58995245618337 [2025-04-08 18:49:46,764] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 35080.58 | forward: 145925.81 | backward_microstep: 390583.83 | backward: 390567.23 | backward_inner_microstep: 390548.93 | backward_inner: 390541.89 | backward_allreduce_microstep: 8.72 | backward_allreduce: 2.89 | reduce_tied_grads: 0.32 | comms: 18.31 | reduce_grads: 0.22 | step: 379.93 | _step_clipping: 0.13 | _step_step: 377.96 | _step_zero_grad: 0.59 | _step_check_overflow: 0.57 samples/sec: 15.885 | iteration 20340/ 143000 | elapsed time per iteration (ms): 64461.4 | learning rate: 5.745E-04 | approx flops per GPU: 68.5TFLOPS | lm_loss: 2.348362E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 19:00:19,369] [INFO] [logging.py:60:log_dist] [Rank 0] step=20350, skipped=22, lr=[0.0005745138996616305, 0.0005745138996616305], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20350 loss: 2.3861 iter time (s): 63.260 samples/sec: 16.187 %comms: 0.0029151445795102637 %optimizer_step 0.05584803894777883 %forward: 23.005285788634293 %backward: 61.72235870711285 [2025-04-08 19:00:19,370] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23659.76 | forward: 145531.40 | backward_microstep: 390468.87 | backward: 390455.55 | backward_inner_microstep: 390436.55 | backward_inner: 390429.88 | backward_allreduce_microstep: 8.25 | backward_allreduce: 2.79 | reduce_tied_grads: 0.32 | comms: 18.44 | reduce_grads: 0.20 | step: 353.29 | _step_clipping: 0.11 | _step_step: 351.42 | _step_zero_grad: 0.51 | _step_check_overflow: 0.66 samples/sec: 16.187 | iteration 20350/ 143000 | elapsed time per iteration (ms): 63260.6 | learning rate: 5.745E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.371391E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 19:10:56,123] [INFO] [logging.py:60:log_dist] [Rank 0] step=20360, skipped=22, lr=[0.000574487309314409, 0.000574487309314409], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20360 loss: 2.3460 iter time (s): 63.675 samples/sec: 16.082 %comms: 0.002842722225985198 %optimizer_step 0.05701634874785111 %forward: 22.881438664136873 %backward: 61.31734610107381 [2025-04-08 19:10:56,124] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27637.01 | forward: 145697.10 | backward_microstep: 390450.17 | backward: 390436.98 | backward_inner_microstep: 390417.17 | backward_inner: 390408.49 | backward_allreduce_microstep: 8.70 | backward_allreduce: 3.15 | reduce_tied_grads: 0.36 | comms: 18.10 | reduce_grads: 0.22 | step: 363.05 | _step_clipping: 0.14 | _step_step: 361.27 | _step_zero_grad: 0.50 | _step_check_overflow: 0.52 samples/sec: 16.082 | iteration 20360/ 143000 | elapsed time per iteration (ms): 63675.4 | learning rate: 5.745E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.368824E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 19:21:39,162] [INFO] [logging.py:60:log_dist] [Rank 0] step=20370, skipped=22, lr=[0.0005744607057191991, 0.0005744607057191991], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20370 loss: 2.3485 iter time (s): 64.303 samples/sec: 15.925 %comms: 0.002856128088007234 %optimizer_step 0.05605485067189117 %forward: 22.678668113036448 %backward: 60.72006086524932 [2025-04-08 19:21:39,163] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 33745.00 | forward: 145831.43 | backward_microstep: 390468.54 | backward: 390450.32 | backward_inner_microstep: 390432.18 | backward_inner: 390424.99 | backward_allreduce_microstep: 8.50 | backward_allreduce: 2.95 | reduce_tied_grads: 0.33 | comms: 18.37 | reduce_grads: 0.21 | step: 360.45 | _step_clipping: 0.14 | _step_step: 358.38 | _step_zero_grad: 0.59 | _step_check_overflow: 0.66 samples/sec: 15.924 | iteration 20370/ 143000 | elapsed time per iteration (ms): 64304.0 | learning rate: 5.745E-04 | approx flops per GPU: 68.7TFLOPS | lm_loss: 2.356572E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 19:32:23,915] [INFO] [logging.py:60:log_dist] [Rank 0] step=20380, skipped=22, lr=[0.0005744340888772848, 0.0005744340888772848], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20380 loss: 2.3553 iter time (s): 64.475 samples/sec: 15.882 %comms: 0.002836010117640727 %optimizer_step 0.05570683710091637 %forward: 22.60771727523634 %backward: 60.58150946585816 [2025-04-08 19:32:23,915] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 35368.70 | forward: 145762.15 | backward_microstep: 390612.79 | backward: 390596.31 | backward_inner_microstep: 390576.51 | backward_inner: 390569.29 | backward_allreduce_microstep: 8.38 | backward_allreduce: 2.86 | reduce_tied_grads: 0.32 | comms: 18.29 | reduce_grads: 0.21 | step: 359.17 | _step_clipping: 0.13 | _step_step: 357.34 | _step_zero_grad: 0.56 | _step_check_overflow: 0.52 samples/sec: 15.882 | iteration 20380/ 143000 | elapsed time per iteration (ms): 64475.2 | learning rate: 5.744E-04 | approx flops per GPU: 68.5TFLOPS | lm_loss: 2.351778E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 19:42:56,544] [INFO] [logging.py:60:log_dist] [Rank 0] step=20390, skipped=22, lr=[0.0005744074587899507, 0.0005744074587899507], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20390 loss: 2.3659 iter time (s): 63.262 samples/sec: 16.187 %comms: 0.002924081886875867 %optimizer_step 0.058220012531832274 %forward: 23.01287938866253 %backward: 61.719964284368075 [2025-04-08 19:42:56,545] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23586.00 | forward: 145584.82 | backward_microstep: 390469.55 | backward: 390454.82 | backward_inner_microstep: 390433.42 | backward_inner: 390424.73 | backward_allreduce_microstep: 10.32 | backward_allreduce: 2.93 | reduce_tied_grads: 0.34 | comms: 18.50 | reduce_grads: 0.22 | step: 368.31 | _step_clipping: 0.13 | _step_step: 366.40 | _step_zero_grad: 0.56 | _step_check_overflow: 0.57 samples/sec: 16.186 | iteration 20390/ 143000 | elapsed time per iteration (ms): 63262.9 | learning rate: 5.744E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.375455E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 19:53:33,312] [INFO] [logging.py:60:log_dist] [Rank 0] step=20400, skipped=22, lr=[0.0005743808154584822, 0.0005743808154584822], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20400 loss: 2.3343 iter time (s): 63.676 samples/sec: 16.081 %comms: 0.002857556898086226 %optimizer_step 0.05554838398722958 %forward: 22.87436948249857 %backward: 61.328022727891565 [2025-04-08 19:53:33,313] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27575.69 | forward: 145655.54 | backward_microstep: 390529.28 | backward: 390514.21 | backward_inner_microstep: 390496.26 | backward_inner: 390489.03 | backward_allreduce_microstep: 8.49 | backward_allreduce: 2.91 | reduce_tied_grads: 0.33 | comms: 18.20 | reduce_grads: 0.20 | step: 353.71 | _step_clipping: 0.12 | _step_step: 351.51 | _step_zero_grad: 0.54 | _step_check_overflow: 0.92 samples/sec: 16.081 | iteration 20400/ 143000 | elapsed time per iteration (ms): 63676.9 | learning rate: 5.744E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.363992E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 20:04:07,747] [INFO] [logging.py:60:log_dist] [Rank 0] step=20410, skipped=22, lr=[0.0005743541588841653, 0.0005743541588841653], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20410 loss: 2.3474 iter time (s): 63.443 samples/sec: 16.141 %comms: 0.0029184733752435376 %optimizer_step 0.05758982999981278 %forward: 22.973225784705807 %backward: 61.57643148471536 [2025-04-08 20:04:07,747] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24965.04 | forward: 145748.38 | backward_microstep: 390674.04 | backward: 390657.59 | backward_inner_microstep: 390639.24 | backward_inner: 390631.99 | backward_allreduce_microstep: 8.60 | backward_allreduce: 2.97 | reduce_tied_grads: 0.33 | comms: 18.52 | reduce_grads: 0.21 | step: 365.37 | _step_clipping: 0.15 | _step_step: 363.36 | _step_zero_grad: 0.56 | _step_check_overflow: 0.66 samples/sec: 16.140 | iteration 20410/ 143000 | elapsed time per iteration (ms): 63443.4 | learning rate: 5.744E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.356668E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 20:14:40,117] [INFO] [logging.py:60:log_dist] [Rank 0] step=20420, skipped=22, lr=[0.0005743274890682863, 0.0005743274890682863], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20420 loss: 2.3636 iter time (s): 63.236 samples/sec: 16.193 %comms: 0.0028915356199163285 %optimizer_step 0.058374725117346205 %forward: 23.02410402046803 %backward: 61.75447582600043 [2025-04-08 20:14:40,117] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23211.78 | forward: 145596.19 | backward_microstep: 390528.05 | backward: 390513.20 | backward_inner_microstep: 390495.24 | backward_inner: 390488.35 | backward_allreduce_microstep: 8.48 | backward_allreduce: 2.95 | reduce_tied_grads: 0.36 | comms: 18.29 | reduce_grads: 0.22 | step: 369.14 | _step_clipping: 0.13 | _step_step: 367.10 | _step_zero_grad: 0.56 | _step_check_overflow: 0.71 samples/sec: 16.193 | iteration 20420/ 143000 | elapsed time per iteration (ms): 63237.0 | learning rate: 5.743E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.368929E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 20:25:19,878] [INFO] [logging.py:60:log_dist] [Rank 0] step=20430, skipped=22, lr=[0.0005743008060121327, 0.0005743008060121327], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20430 loss: 2.3683 iter time (s): 63.976 samples/sec: 16.006 %comms: 0.00283994430329779 %optimizer_step 0.056417046694108956 %forward: 22.760762184062056 %backward: 61.04127707758787 [2025-04-08 20:25:19,879] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30602.82 | forward: 145613.13 | backward_microstep: 390531.55 | backward: 390514.67 | backward_inner_microstep: 390494.77 | backward_inner: 390487.77 | backward_allreduce_microstep: 8.57 | backward_allreduce: 3.00 | reduce_tied_grads: 0.32 | comms: 18.17 | reduce_grads: 0.22 | step: 360.93 | _step_clipping: 0.12 | _step_step: 359.04 | _step_zero_grad: 0.55 | _step_check_overflow: 0.56 samples/sec: 16.006 | iteration 20430/ 143000 | elapsed time per iteration (ms): 63976.1 | learning rate: 5.743E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.368978E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 20:36:02,061] [INFO] [logging.py:60:log_dist] [Rank 0] step=20440, skipped=22, lr=[0.000574274109716992, 0.000574274109716992], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20440 loss: 2.3464 iter time (s): 64.218 samples/sec: 15.946 %comms: 0.0028656899780256746 %optimizer_step 0.05649630921076357 %forward: 22.687449845714948 %backward: 60.78784425657393 [2025-04-08 20:36:02,062] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 33124.57 | forward: 145693.69 | backward_microstep: 390380.77 | backward: 390365.83 | backward_inner_microstep: 390348.17 | backward_inner: 390339.62 | backward_allreduce_microstep: 8.38 | backward_allreduce: 2.86 | reduce_tied_grads: 0.33 | comms: 18.40 | reduce_grads: 0.21 | step: 362.81 | _step_clipping: 0.12 | _step_step: 360.72 | _step_zero_grad: 0.51 | _step_check_overflow: 0.82 samples/sec: 15.946 | iteration 20440/ 143000 | elapsed time per iteration (ms): 64218.3 | learning rate: 5.743E-04 | approx flops per GPU: 68.8TFLOPS | lm_loss: 2.364081E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 20:46:53,295] [INFO] [logging.py:60:log_dist] [Rank 0] step=20450, skipped=22, lr=[0.0005742474001841531, 0.0005742474001841531], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20450 loss: 2.3703 iter time (s): 65.123 samples/sec: 15.724 %comms: 0.00283047665110749 %optimizer_step 0.05673020161659122 %forward: 22.416287338722338 %backward: 60.005106814999586 [2025-04-08 20:46:53,296] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 41419.90 | forward: 145981.13 | backward_microstep: 390790.57 | backward: 390770.04 | backward_inner_microstep: 390751.04 | backward_inner: 390743.39 | backward_allreduce_microstep: 8.92 | backward_allreduce: 3.10 | reduce_tied_grads: 0.35 | comms: 18.43 | reduce_grads: 0.23 | step: 369.44 | _step_clipping: 0.14 | _step_step: 367.34 | _step_zero_grad: 0.53 | _step_check_overflow: 0.56 samples/sec: 15.724 | iteration 20450/ 143000 | elapsed time per iteration (ms): 65123.4 | learning rate: 5.742E-04 | approx flops per GPU: 67.8TFLOPS | lm_loss: 2.353257E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 20:57:32,982] [INFO] [logging.py:60:log_dist] [Rank 0] step=20460, skipped=22, lr=[0.0005742206774149048, 0.0005742206774149048], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20460 loss: 2.3739 iter time (s): 63.968 samples/sec: 16.008 %comms: 0.002838562117766295 %optimizer_step 0.0563792577206465 %forward: 22.807898050957505 %backward: 61.0529567141584 [2025-04-08 20:57:32,983] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30213.32 | forward: 145897.61 | backward_microstep: 390559.70 | backward: 390543.69 | backward_inner_microstep: 390524.97 | backward_inner: 390517.83 | backward_allreduce_microstep: 8.83 | backward_allreduce: 3.04 | reduce_tied_grads: 0.34 | comms: 18.16 | reduce_grads: 0.22 | step: 360.65 | _step_clipping: 0.14 | _step_step: 358.83 | _step_zero_grad: 0.52 | _step_check_overflow: 0.53 samples/sec: 16.008 | iteration 20460/ 143000 | elapsed time per iteration (ms): 63968.6 | learning rate: 5.742E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.373436E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 21:08:10,896] [INFO] [logging.py:60:log_dist] [Rank 0] step=20470, skipped=22, lr=[0.0005741939414105371, 0.0005741939414105371], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20470 loss: 2.3504 iter time (s): 63.791 samples/sec: 16.052 %comms: 0.002823911330529724 %optimizer_step 0.05475964581224071 %forward: 22.81440355939903 %backward: 61.16450955661288 [2025-04-08 21:08:10,896] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29250.36 | forward: 145534.89 | backward_microstep: 390183.57 | backward: 390173.25 | backward_inner_microstep: 390155.96 | backward_inner: 390149.37 | backward_allreduce_microstep: 8.38 | backward_allreduce: 2.86 | reduce_tied_grads: 0.30 | comms: 18.01 | reduce_grads: 0.20 | step: 349.32 | _step_clipping: 0.14 | _step_step: 347.63 | _step_zero_grad: 0.48 | _step_check_overflow: 0.48 samples/sec: 16.052 | iteration 20470/ 143000 | elapsed time per iteration (ms): 63791.4 | learning rate: 5.742E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.365150E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 21:18:45,916] [INFO] [logging.py:60:log_dist] [Rank 0] step=20480, skipped=22, lr=[0.00057416719217234, 0.00057416719217234], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20480 loss: 2.3686 iter time (s): 63.501 samples/sec: 16.126 %comms: 0.0028686560932024182 %optimizer_step 0.05684628484683986 %forward: 22.93927935377625 %backward: 61.51552460171818 [2025-04-08 21:18:45,916] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25649.81 | forward: 145667.67 | backward_microstep: 390648.88 | backward: 390632.27 | backward_inner_microstep: 390614.07 | backward_inner: 390606.82 | backward_allreduce_microstep: 8.41 | backward_allreduce: 2.90 | reduce_tied_grads: 0.35 | comms: 18.22 | reduce_grads: 0.21 | step: 360.98 | _step_clipping: 0.12 | _step_step: 359.16 | _step_zero_grad: 0.55 | _step_check_overflow: 0.54 samples/sec: 16.125 | iteration 20480/ 143000 | elapsed time per iteration (ms): 63502.0 | learning rate: 5.742E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.355267E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 21:29:23,415] [INFO] [logging.py:60:log_dist] [Rank 0] step=20490, skipped=22, lr=[0.0005741404297016049, 0.0005741404297016049], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20490 loss: 2.3664 iter time (s): 63.749 samples/sec: 16.063 %comms: 0.0028543949179362844 %optimizer_step 0.056959132318757474 %forward: 22.874262594138877 %backward: 61.25296476233654 [2025-04-08 21:29:23,416] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28131.24 | forward: 145821.94 | backward_microstep: 390497.74 | backward: 390483.68 | backward_inner_microstep: 390465.84 | backward_inner: 390458.81 | backward_allreduce_microstep: 8.40 | backward_allreduce: 2.92 | reduce_tied_grads: 0.39 | comms: 18.20 | reduce_grads: 0.22 | step: 363.11 | _step_clipping: 0.19 | _step_step: 360.93 | _step_zero_grad: 0.51 | _step_check_overflow: 0.83 samples/sec: 16.063 | iteration 20490/ 143000 | elapsed time per iteration (ms): 63750.0 | learning rate: 5.741E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.359817E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 21:40:11,353] [INFO] [logging.py:60:log_dist] [Rank 0] step=20500, skipped=22, lr=[0.0005741136539996235, 0.0005741136539996235], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20500 loss: 2.3379 iter time (s): 64.793 samples/sec: 15.804 %comms: 0.0028178324496491475 %optimizer_step 0.056363530011918996 %forward: 22.500815716277277 %backward: 60.26394069662935 [2025-04-08 21:40:11,354] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 38636.20 | forward: 145789.82 | backward_microstep: 390483.64 | backward: 390468.92 | backward_inner_microstep: 390449.22 | backward_inner: 390442.28 | backward_allreduce_microstep: 8.62 | backward_allreduce: 3.12 | reduce_tied_grads: 0.32 | comms: 18.26 | reduce_grads: 0.21 | step: 365.20 | _step_clipping: 0.12 | _step_step: 363.36 | _step_zero_grad: 0.55 | _step_check_overflow: 0.54 samples/sec: 15.804 | iteration 20500/ 143000 | elapsed time per iteration (ms): 64793.8 | learning rate: 5.741E-04 | approx flops per GPU: 68.2TFLOPS | lm_loss: 2.358271E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 21:50:49,225] [INFO] [logging.py:60:log_dist] [Rank 0] step=20510, skipped=22, lr=[0.000574086865067688, 0.000574086865067688], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20510 loss: 2.3354 iter time (s): 63.787 samples/sec: 16.054 %comms: 0.002885842761398978 %optimizer_step 0.05609754937946532 %forward: 22.88505118050061 %backward: 61.23504916435775 [2025-04-08 21:50:49,226] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28246.06 | forward: 145976.05 | backward_microstep: 390614.18 | backward: 390597.81 | backward_inner_microstep: 390578.80 | backward_inner: 390570.00 | backward_allreduce_microstep: 8.92 | backward_allreduce: 2.99 | reduce_tied_grads: 0.32 | comms: 18.41 | reduce_grads: 0.22 | step: 357.83 | _step_clipping: 0.13 | _step_step: 355.99 | _step_zero_grad: 0.52 | _step_check_overflow: 0.58 samples/sec: 16.053 | iteration 20510/ 143000 | elapsed time per iteration (ms): 63787.2 | learning rate: 5.741E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.351006E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 22:01:24,004] [INFO] [logging.py:60:log_dist] [Rank 0] step=20520, skipped=22, lr=[0.0005740600629070911, 0.0005740600629070911], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20520 loss: 2.3555 iter time (s): 63.477 samples/sec: 16.132 %comms: 0.002918500695126222 %optimizer_step 0.05750279818949187 %forward: 22.953881428567655 %backward: 61.505398190014105 [2025-04-08 22:01:24,005] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25578.97 | forward: 145704.92 | backward_microstep: 390434.50 | backward: 390419.33 | backward_inner_microstep: 390400.95 | backward_inner: 390393.73 | backward_allreduce_microstep: 8.64 | backward_allreduce: 2.95 | reduce_tied_grads: 0.53 | comms: 18.53 | reduce_grads: 0.27 | step: 365.01 | _step_clipping: 0.14 | _step_step: 362.97 | _step_zero_grad: 0.60 | _step_check_overflow: 0.63 samples/sec: 16.132 | iteration 20520/ 143000 | elapsed time per iteration (ms): 63477.9 | learning rate: 5.741E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.352271E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 22:11:58,044] [INFO] [logging.py:60:log_dist] [Rank 0] step=20530, skipped=22, lr=[0.0005740332475191269, 0.0005740332475191269], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20530 loss: 2.3596 iter time (s): 63.403 samples/sec: 16.151 %comms: 0.002875951375110367 %optimizer_step 0.05733082210959444 %forward: 22.993121705084572 %backward: 61.61313547954583 [2025-04-08 22:11:58,044] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24537.06 | forward: 145784.06 | backward_microstep: 390663.96 | backward: 390647.84 | backward_inner_microstep: 390628.10 | backward_inner: 390620.92 | backward_allreduce_microstep: 8.43 | backward_allreduce: 2.87 | reduce_tied_grads: 0.34 | comms: 18.23 | reduce_grads: 0.21 | step: 363.50 | _step_clipping: 0.14 | _step_step: 361.51 | _step_zero_grad: 0.54 | _step_check_overflow: 0.66 samples/sec: 16.150 | iteration 20530/ 143000 | elapsed time per iteration (ms): 63404.0 | learning rate: 5.740E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.363170E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 22:22:40,349] [INFO] [logging.py:60:log_dist] [Rank 0] step=20540, skipped=22, lr=[0.0005740064189050893, 0.0005740064189050893], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20540 loss: 2.3476 iter time (s): 64.230 samples/sec: 15.943 %comms: 0.0028170042422097988 %optimizer_step 0.05504205056634865 %forward: 22.70527152729911 %backward: 60.81912879050634 [2025-04-08 22:22:40,349] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32774.12 | forward: 145835.70 | backward_microstep: 390658.88 | backward: 390640.57 | backward_inner_microstep: 390622.46 | backward_inner: 390615.16 | backward_allreduce_microstep: 8.40 | backward_allreduce: 2.89 | reduce_tied_grads: 0.32 | comms: 18.09 | reduce_grads: 0.20 | step: 353.53 | _step_clipping: 0.12 | _step_step: 351.81 | _step_zero_grad: 0.51 | _step_check_overflow: 0.49 samples/sec: 15.943 | iteration 20540/ 143000 | elapsed time per iteration (ms): 64230.5 | learning rate: 5.740E-04 | approx flops per GPU: 68.8TFLOPS | lm_loss: 2.354049E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 22:33:13,315] [INFO] [logging.py:60:log_dist] [Rank 0] step=20550, skipped=22, lr=[0.0005739795770662732, 0.0005739795770662732], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20550 loss: 2.3590 iter time (s): 63.296 samples/sec: 16.178 %comms: 0.0029035387981998814 %optimizer_step 0.056433344913500154 %forward: 22.970136691004626 %backward: 61.659946704905764 [2025-04-08 22:33:13,316] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24283.57 | forward: 145391.92 | backward_microstep: 390294.41 | backward: 390283.19 | backward_inner_microstep: 390265.48 | backward_inner: 390258.76 | backward_allreduce_microstep: 8.52 | backward_allreduce: 2.92 | reduce_tied_grads: 0.36 | comms: 18.38 | reduce_grads: 0.22 | step: 357.20 | _step_clipping: 0.14 | _step_step: 355.21 | _step_zero_grad: 0.53 | _step_check_overflow: 0.69 samples/sec: 16.178 | iteration 20550/ 143000 | elapsed time per iteration (ms): 63296.6 | learning rate: 5.740E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.356685E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 22:43:59,135] [INFO] [logging.py:60:log_dist] [Rank 0] step=20560, skipped=22, lr=[0.0005739527220039742, 0.0005739527220039742], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20560 loss: 2.3511 iter time (s): 64.581 samples/sec: 15.856 %comms: 0.0028003403813105883 %optimizer_step 0.0547965410783632 %forward: 22.553586173930317 %backward: 60.44675180471525 [2025-04-08 22:43:59,136] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 36785.41 | forward: 145654.32 | backward_microstep: 390387.58 | backward: 390373.86 | backward_inner_microstep: 390354.32 | backward_inner: 390347.62 | backward_allreduce_microstep: 10.18 | backward_allreduce: 2.94 | reduce_tied_grads: 0.34 | comms: 18.09 | reduce_grads: 0.21 | step: 353.88 | _step_clipping: 0.13 | _step_step: 352.09 | _step_zero_grad: 0.49 | _step_check_overflow: 0.54 samples/sec: 15.856 | iteration 20560/ 143000 | elapsed time per iteration (ms): 64582.0 | learning rate: 5.740E-04 | approx flops per GPU: 68.4TFLOPS | lm_loss: 2.356154E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 22:54:36,828] [INFO] [logging.py:60:log_dist] [Rank 0] step=20570, skipped=22, lr=[0.0005739258537194883, 0.0005739258537194883], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20570 loss: 2.3519 iter time (s): 63.769 samples/sec: 16.058 %comms: 0.002837414501380685 %optimizer_step 0.05555273319596502 %forward: 22.824679101144906 %backward: 61.21191086166201 [2025-04-08 22:54:36,829] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28783.78 | forward: 145550.02 | backward_microstep: 390351.55 | backward: 390340.42 | backward_inner_microstep: 390322.86 | backward_inner: 390314.46 | backward_allreduce_microstep: 8.36 | backward_allreduce: 2.81 | reduce_tied_grads: 0.30 | comms: 18.09 | reduce_grads: 0.21 | step: 354.25 | _step_clipping: 0.15 | _step_step: 352.44 | _step_zero_grad: 0.50 | _step_check_overflow: 0.56 samples/sec: 16.058 | iteration 20570/ 143000 | elapsed time per iteration (ms): 63769.3 | learning rate: 5.739E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.364776E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 23:05:14,975] [INFO] [logging.py:60:log_dist] [Rank 0] step=20580, skipped=22, lr=[0.0005738989722141125, 0.0005738989722141125], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20580 loss: 2.3418 iter time (s): 63.814 samples/sec: 16.047 %comms: 0.0028158926049488254 %optimizer_step 0.05626730210236655 %forward: 22.847952563530487 %backward: 61.178863364555845 [2025-04-08 23:05:14,976] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28901.35 | forward: 145802.19 | backward_microstep: 390421.42 | backward: 390407.51 | backward_inner_microstep: 390389.13 | backward_inner: 390382.07 | backward_allreduce_microstep: 8.69 | backward_allreduce: 2.99 | reduce_tied_grads: 0.29 | comms: 17.97 | reduce_grads: 0.19 | step: 359.06 | _step_clipping: 0.11 | _step_step: 357.37 | _step_zero_grad: 0.52 | _step_check_overflow: 0.49 samples/sec: 16.046 | iteration 20580/ 143000 | elapsed time per iteration (ms): 63814.7 | learning rate: 5.739E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.350574E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 23:15:55,132] [INFO] [logging.py:60:log_dist] [Rank 0] step=20590, skipped=22, lr=[0.000573872077489144, 0.000573872077489144], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20590 loss: 2.3654 iter time (s): 64.015 samples/sec: 15.996 %comms: 0.0029063095337516264 %optimizer_step 0.057315236683337774 %forward: 22.782288143142136 %backward: 60.97649231604388 [2025-04-08 23:15:55,133] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30928.70 | forward: 145840.93 | backward_microstep: 390354.24 | backward: 390341.33 | backward_inner_microstep: 390322.97 | backward_inner: 390316.11 | backward_allreduce_microstep: 8.71 | backward_allreduce: 3.00 | reduce_tied_grads: 0.37 | comms: 18.60 | reduce_grads: 0.25 | step: 366.90 | _step_clipping: 0.14 | _step_step: 364.49 | _step_zero_grad: 0.64 | _step_check_overflow: 0.92 samples/sec: 15.996 | iteration 20590/ 143000 | elapsed time per iteration (ms): 64015.7 | learning rate: 5.739E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.354282E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 23:26:28,420] [INFO] [logging.py:60:log_dist] [Rank 0] step=20600, skipped=22, lr=[0.000573845169545881, 0.000573845169545881], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20600 loss: 2.3429 iter time (s): 63.328 samples/sec: 16.170 %comms: 0.0029316260941064055 %optimizer_step 0.0577485049778952 %forward: 22.975256253153017 %backward: 61.64705950792902 [2025-04-08 23:26:28,421] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24352.83 | forward: 145497.82 | backward_microstep: 390413.37 | backward: 390398.81 | backward_inner_microstep: 390381.30 | backward_inner: 390374.39 | backward_allreduce_microstep: 8.21 | backward_allreduce: 2.82 | reduce_tied_grads: 0.34 | comms: 18.57 | reduce_grads: 0.22 | step: 365.71 | _step_clipping: 0.15 | _step_step: 363.65 | _step_zero_grad: 0.55 | _step_check_overflow: 0.70 samples/sec: 16.170 | iteration 20600/ 143000 | elapsed time per iteration (ms): 63328.7 | learning rate: 5.738E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.350159E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 23:36:59,652] [INFO] [logging.py:60:log_dist] [Rank 0] step=20610, skipped=22, lr=[0.0005738182483856221, 0.0005738182483856221], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20610 loss: 2.3735 iter time (s): 63.123 samples/sec: 16.222 %comms: 0.002878622755736978 %optimizer_step 0.05684650115196071 %forward: 23.041885141054443 %backward: 61.84277977483965 [2025-04-08 23:36:59,653] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22400.51 | forward: 145446.21 | backward_microstep: 390384.44 | backward: 390367.27 | backward_inner_microstep: 390349.21 | backward_inner: 390342.13 | backward_allreduce_microstep: 8.55 | backward_allreduce: 2.94 | reduce_tied_grads: 0.35 | comms: 18.17 | reduce_grads: 0.23 | step: 358.83 | _step_clipping: 0.13 | _step_step: 356.98 | _step_zero_grad: 0.52 | _step_check_overflow: 0.58 samples/sec: 16.222 | iteration 20610/ 143000 | elapsed time per iteration (ms): 63123.2 | learning rate: 5.738E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.358993E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 23:47:32,173] [INFO] [logging.py:60:log_dist] [Rank 0] step=20620, skipped=22, lr=[0.0005737913140096668, 0.0005737913140096668], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20620 loss: 2.3467 iter time (s): 63.252 samples/sec: 16.189 %comms: 0.0028576001640648707 %optimizer_step 0.056577136044324496 %forward: 22.969928159205892 %backward: 61.67400633025814 [2025-04-08 23:47:32,173] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24157.60 | forward: 145288.25 | backward_microstep: 390107.90 | backward: 390097.37 | backward_inner_microstep: 390080.25 | backward_inner: 390073.63 | backward_allreduce_microstep: 8.30 | backward_allreduce: 2.78 | reduce_tied_grads: 0.31 | comms: 18.07 | reduce_grads: 0.20 | step: 357.86 | _step_clipping: 0.13 | _step_step: 356.14 | _step_zero_grad: 0.50 | _step_check_overflow: 0.50 samples/sec: 16.189 | iteration 20620/ 143000 | elapsed time per iteration (ms): 63252.1 | learning rate: 5.738E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.371092E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-08 23:58:07,435] [INFO] [logging.py:60:log_dist] [Rank 0] step=20630, skipped=22, lr=[0.0005737643664193149, 0.0005737643664193149], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20630 loss: 2.3447 iter time (s): 63.526 samples/sec: 16.119 %comms: 0.0029147754724636343 %optimizer_step 0.056896050919249076 %forward: 22.9443197879778 %backward: 61.45424610796116 [2025-04-08 23:58:07,436] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26074.68 | forward: 145755.29 | backward_microstep: 390405.75 | backward: 390392.12 | backward_inner_microstep: 390374.12 | backward_inner: 390367.32 | backward_allreduce_microstep: 8.57 | backward_allreduce: 3.10 | reduce_tied_grads: 0.37 | comms: 18.52 | reduce_grads: 0.22 | step: 361.44 | _step_clipping: 0.14 | _step_step: 359.57 | _step_zero_grad: 0.56 | _step_check_overflow: 0.50 samples/sec: 16.119 | iteration 20630/ 143000 | elapsed time per iteration (ms): 63526.3 | learning rate: 5.738E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.351626E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 00:08:40,247] [INFO] [logging.py:60:log_dist] [Rank 0] step=20640, skipped=22, lr=[0.0005737374056158671, 0.0005737374056158671], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20640 loss: 2.3426 iter time (s): 63.281 samples/sec: 16.182 %comms: 0.0028579794590012997 %optimizer_step 0.056549834650859186 %forward: 23.005973619766774 %backward: 61.67645213989624 [2025-04-09 00:08:40,248] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23916.43 | forward: 145583.30 | backward_microstep: 390306.08 | backward: 390292.60 | backward_inner_microstep: 390275.02 | backward_inner: 390268.33 | backward_allreduce_microstep: 8.38 | backward_allreduce: 2.88 | reduce_tied_grads: 0.35 | comms: 18.09 | reduce_grads: 0.23 | step: 357.85 | _step_clipping: 0.14 | _step_step: 356.03 | _step_zero_grad: 0.49 | _step_check_overflow: 0.56 samples/sec: 16.182 | iteration 20640/ 143000 | elapsed time per iteration (ms): 63281.2 | learning rate: 5.737E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.347679E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 00:19:18,315] [INFO] [logging.py:60:log_dist] [Rank 0] step=20650, skipped=22, lr=[0.0005737104316006247, 0.0005737104316006247], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20650 loss: 2.3402 iter time (s): 63.806 samples/sec: 16.049 %comms: 0.0028607808258254096 %optimizer_step 0.05619268400664291 %forward: 22.798841664496543 %backward: 61.15341666133737 [2025-04-09 00:19:18,316] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29393.20 | forward: 145470.82 | backward_microstep: 390206.88 | backward: 390196.92 | backward_inner_microstep: 390179.66 | backward_inner: 390173.12 | backward_allreduce_microstep: 8.15 | backward_allreduce: 2.83 | reduce_tied_grads: 0.32 | comms: 18.25 | reduce_grads: 0.20 | step: 358.54 | _step_clipping: 0.12 | _step_step: 356.83 | _step_zero_grad: 0.50 | _step_check_overflow: 0.50 samples/sec: 16.048 | iteration 20650/ 143000 | elapsed time per iteration (ms): 63806.8 | learning rate: 5.737E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.350616E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 00:29:58,568] [INFO] [logging.py:60:log_dist] [Rank 0] step=20660, skipped=22, lr=[0.0005736834443748894, 0.0005736834443748894], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20660 loss: 2.3621 iter time (s): 64.025 samples/sec: 15.994 %comms: 0.0028276361661847484 %optimizer_step 0.05614105095704052 %forward: 22.81036403217691 %backward: 60.98468412749271 [2025-04-09 00:29:58,569] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30714.95 | forward: 146042.53 | backward_microstep: 390466.21 | backward: 390452.23 | backward_inner_microstep: 390433.65 | backward_inner: 390426.41 | backward_allreduce_microstep: 8.93 | backward_allreduce: 3.22 | reduce_tied_grads: 0.33 | comms: 18.10 | reduce_grads: 0.21 | step: 359.44 | _step_clipping: 0.13 | _step_step: 357.55 | _step_zero_grad: 0.57 | _step_check_overflow: 0.57 samples/sec: 15.994 | iteration 20660/ 143000 | elapsed time per iteration (ms): 64025.3 | learning rate: 5.737E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.363321E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 00:40:37,139] [INFO] [logging.py:60:log_dist] [Rank 0] step=20670, skipped=22, lr=[0.0005736564439399639, 0.0005736564439399639], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20670 loss: 2.3651 iter time (s): 63.856 samples/sec: 16.036 %comms: 0.00286009834205629 %optimizer_step 0.056743136918987906 %forward: 22.826602017852977 %backward: 61.13694605547321 [2025-04-09 00:40:37,140] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29376.98 | forward: 145762.62 | backward_microstep: 390411.65 | backward: 390398.96 | backward_inner_microstep: 390379.86 | backward_inner: 390373.18 | backward_allreduce_microstep: 9.94 | backward_allreduce: 4.51 | reduce_tied_grads: 0.34 | comms: 18.26 | reduce_grads: 0.21 | step: 362.34 | _step_clipping: 0.13 | _step_step: 360.40 | _step_zero_grad: 0.56 | _step_check_overflow: 0.63 samples/sec: 16.036 | iteration 20670/ 143000 | elapsed time per iteration (ms): 63857.1 | learning rate: 5.737E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.357377E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 00:51:17,038] [INFO] [logging.py:60:log_dist] [Rank 0] step=20680, skipped=22, lr=[0.0005736294302971512, 0.0005736294302971512], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20680 loss: 2.3737 iter time (s): 63.989 samples/sec: 16.003 %comms: 0.0028490935394124128 %optimizer_step 0.05601812625151423 %forward: 22.769999011124185 %backward: 61.024771295431215 [2025-04-09 00:51:17,039] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30650.65 | forward: 145703.66 | backward_microstep: 390507.56 | backward: 390493.31 | backward_inner_microstep: 390473.61 | backward_inner: 390466.64 | backward_allreduce_microstep: 10.09 | backward_allreduce: 2.91 | reduce_tied_grads: 0.32 | comms: 18.23 | reduce_grads: 0.21 | step: 358.46 | _step_clipping: 0.11 | _step_step: 356.50 | _step_zero_grad: 0.55 | _step_check_overflow: 0.63 samples/sec: 16.003 | iteration 20680/ 143000 | elapsed time per iteration (ms): 63990.0 | learning rate: 5.736E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.373367E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 01:01:48,463] [INFO] [logging.py:60:log_dist] [Rank 0] step=20690, skipped=22, lr=[0.0005736024034477553, 0.0005736024034477553], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20690 loss: 2.3644 iter time (s): 63.142 samples/sec: 16.217 %comms: 0.002906741269448861 %optimizer_step 0.058063724808179715 %forward: 23.07116348838625 %backward: 61.86160394212069 [2025-04-09 01:01:48,463] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22056.34 | forward: 145675.58 | backward_microstep: 390621.85 | backward: 390605.58 | backward_inner_microstep: 390586.94 | backward_inner: 390579.66 | backward_allreduce_microstep: 8.82 | backward_allreduce: 3.17 | reduce_tied_grads: 0.32 | comms: 18.35 | reduce_grads: 0.22 | step: 366.63 | _step_clipping: 0.13 | _step_step: 364.84 | _step_zero_grad: 0.51 | _step_check_overflow: 0.53 samples/sec: 16.217 | iteration 20690/ 143000 | elapsed time per iteration (ms): 63142.4 | learning rate: 5.736E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.366692E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 01:12:17,660] [INFO] [logging.py:60:log_dist] [Rank 0] step=20700, skipped=22, lr=[0.0005735753633930806, 0.0005735753633930806], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20700 loss: 2.3585 iter time (s): 62.919 samples/sec: 16.275 %comms: 0.002934951025713148 %optimizer_step 0.05789853454300015 %forward: 23.148516522213658 %backward: 62.03951618682463 [2025-04-09 01:12:17,661] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20154.97 | forward: 145648.58 | backward_microstep: 390359.94 | backward: 390347.58 | backward_inner_microstep: 390329.55 | backward_inner: 390322.83 | backward_allreduce_microstep: 8.61 | backward_allreduce: 2.99 | reduce_tied_grads: 0.33 | comms: 18.47 | reduce_grads: 0.21 | step: 364.29 | _step_clipping: 0.12 | _step_step: 362.30 | _step_zero_grad: 0.56 | _step_check_overflow: 0.65 samples/sec: 16.275 | iteration 20700/ 143000 | elapsed time per iteration (ms): 62919.8 | learning rate: 5.736E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.360962E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 01:22:57,881] [INFO] [logging.py:60:log_dist] [Rank 0] step=20710, skipped=22, lr=[0.000573548310134432, 0.000573548310134432], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20710 loss: 2.3384 iter time (s): 64.021 samples/sec: 15.995 %comms: 0.0028637899908869956 %optimizer_step 0.05619355533236713 %forward: 22.756554253740926 %backward: 61.031428866960205 [2025-04-09 01:22:57,882] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30699.96 | forward: 145690.68 | backward_microstep: 390748.97 | backward: 390731.84 | backward_inner_microstep: 390712.97 | backward_inner: 390705.66 | backward_allreduce_microstep: 8.95 | backward_allreduce: 3.01 | reduce_tied_grads: 0.36 | comms: 18.33 | reduce_grads: 0.21 | step: 359.76 | _step_clipping: 0.13 | _step_step: 357.63 | _step_zero_grad: 0.73 | _step_check_overflow: 0.60 samples/sec: 15.994 | iteration 20710/ 143000 | elapsed time per iteration (ms): 64022.1 | learning rate: 5.735E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.355528E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 01:33:28,073] [INFO] [logging.py:60:log_dist] [Rank 0] step=20720, skipped=22, lr=[0.0005735212436731153, 0.0005735212436731153], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20720 loss: 2.3498 iter time (s): 63.019 samples/sec: 16.249 %comms: 0.0028808358481781926 %optimizer_step 0.05577413365894085 %forward: 23.07591845796591 %backward: 61.935258938075165 [2025-04-09 01:33:28,074] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21430.97 | forward: 145421.18 | backward_microstep: 390319.53 | backward: 390307.26 | backward_inner_microstep: 390289.13 | backward_inner: 390282.41 | backward_allreduce_microstep: 8.78 | backward_allreduce: 3.23 | reduce_tied_grads: 0.30 | comms: 18.15 | reduce_grads: 0.19 | step: 351.48 | _step_clipping: 0.11 | _step_step: 349.42 | _step_zero_grad: 0.65 | _step_check_overflow: 0.73 samples/sec: 16.249 | iteration 20720/ 143000 | elapsed time per iteration (ms): 63019.1 | learning rate: 5.735E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.353755E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 01:43:57,518] [INFO] [logging.py:60:log_dist] [Rank 0] step=20730, skipped=22, lr=[0.000573494164010437, 0.000573494164010437], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20730 loss: 2.3557 iter time (s): 62.944 samples/sec: 16.268 %comms: 0.0028850086297632195 %optimizer_step 0.05788607736057802 %forward: 23.61543120261648 %backward: 62.009846561043666 [2025-04-09 01:43:57,519] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17427.31 | forward: 148644.90 | backward_microstep: 390325.53 | backward: 390314.59 | backward_inner_microstep: 390297.37 | backward_inner: 390290.55 | backward_allreduce_microstep: 8.25 | backward_allreduce: 2.84 | reduce_tied_grads: 0.31 | comms: 18.16 | reduce_grads: 0.21 | step: 364.36 | _step_clipping: 0.12 | _step_step: 362.59 | _step_zero_grad: 0.53 | _step_check_overflow: 0.51 samples/sec: 16.268 | iteration 20730/ 143000 | elapsed time per iteration (ms): 62944.5 | learning rate: 5.735E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.348077E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 01:54:38,495] [INFO] [logging.py:60:log_dist] [Rank 0] step=20740, skipped=22, lr=[0.0005734670711477039, 0.0005734670711477039], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20740 loss: 2.3438 iter time (s): 64.097 samples/sec: 15.976 %comms: 0.0028909486853119326 %optimizer_step 0.05691059813407665 %forward: 22.815794003032728 %backward: 60.94818159329554 [2025-04-09 01:54:38,496] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30972.82 | forward: 146242.53 | backward_microstep: 390677.24 | backward: 390659.91 | backward_inner_microstep: 390641.12 | backward_inner: 390633.67 | backward_allreduce_microstep: 8.66 | backward_allreduce: 2.99 | reduce_tied_grads: 0.32 | comms: 18.53 | reduce_grads: 0.21 | step: 364.78 | _step_clipping: 0.13 | _step_step: 362.75 | _step_zero_grad: 0.57 | _step_check_overflow: 0.68 samples/sec: 15.976 | iteration 20740/ 143000 | elapsed time per iteration (ms): 64097.7 | learning rate: 5.735E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.346962E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 02:05:23,981] [INFO] [logging.py:60:log_dist] [Rank 0] step=20750, skipped=22, lr=[0.0005734399650862236, 0.0005734399650862236], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20750 loss: 2.3646 iter time (s): 64.548 samples/sec: 15.864 %comms: 0.002839654281910791 %optimizer_step 0.05515557223039378 %forward: 22.586639596844723 %backward: 60.48080845754868 [2025-04-09 02:05:23,981] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 36267.16 | forward: 145792.09 | backward_microstep: 390403.80 | backward: 390391.13 | backward_inner_microstep: 390373.08 | backward_inner: 390366.23 | backward_allreduce_microstep: 8.70 | backward_allreduce: 3.15 | reduce_tied_grads: 0.32 | comms: 18.33 | reduce_grads: 0.22 | step: 356.02 | _step_clipping: 0.15 | _step_step: 354.24 | _step_zero_grad: 0.55 | _step_check_overflow: 0.46 samples/sec: 15.864 | iteration 20750/ 143000 | elapsed time per iteration (ms): 64548.5 | learning rate: 5.734E-04 | approx flops per GPU: 68.4TFLOPS | lm_loss: 2.372032E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 02:16:02,300] [INFO] [logging.py:60:log_dist] [Rank 0] step=20760, skipped=22, lr=[0.0005734128458273046, 0.0005734128458273046], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20760 loss: 2.3551 iter time (s): 63.831 samples/sec: 16.042 %comms: 0.00284897251887346 %optimizer_step 0.057275422298013974 %forward: 22.85173855650435 %backward: 61.19332557083599 [2025-04-09 02:16:02,301] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28725.18 | forward: 145865.74 | backward_microstep: 390621.43 | backward: 390605.28 | backward_inner_microstep: 390585.85 | backward_inner: 390578.70 | backward_allreduce_microstep: 9.95 | backward_allreduce: 4.49 | reduce_tied_grads: 0.32 | comms: 18.19 | reduce_grads: 0.21 | step: 365.60 | _step_clipping: 0.13 | _step_step: 363.68 | _step_zero_grad: 0.56 | _step_check_overflow: 0.57 samples/sec: 16.042 | iteration 20760/ 143000 | elapsed time per iteration (ms): 63832.0 | learning rate: 5.734E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.354730E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 02:26:48,349] [INFO] [logging.py:60:log_dist] [Rank 0] step=20770, skipped=22, lr=[0.0005733857133722555, 0.0005733857133722555], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20770 loss: 2.3542 iter time (s): 64.604 samples/sec: 15.850 %comms: 0.0027862128063757124 %optimizer_step 0.055931732993092724 %forward: 22.56720021230928 %backward: 60.41062311104852 [2025-04-09 02:26:48,350] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 36947.08 | forward: 145793.76 | backward_microstep: 390291.60 | backward: 390278.45 | backward_inner_microstep: 390261.22 | backward_inner: 390253.08 | backward_allreduce_microstep: 8.23 | backward_allreduce: 2.81 | reduce_tied_grads: 0.32 | comms: 18.00 | reduce_grads: 0.22 | step: 361.34 | _step_clipping: 0.18 | _step_step: 359.46 | _step_zero_grad: 0.55 | _step_check_overflow: 0.53 samples/sec: 15.850 | iteration 20770/ 143000 | elapsed time per iteration (ms): 64604.9 | learning rate: 5.734E-04 | approx flops per GPU: 68.4TFLOPS | lm_loss: 2.349803E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 02:37:28,495] [INFO] [logging.py:60:log_dist] [Rank 0] step=20780, skipped=22, lr=[0.0005733585677223861, 0.0005733585677223861], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20780 loss: 2.3525 iter time (s): 64.014 samples/sec: 15.996 %comms: 0.0028177878153535572 %optimizer_step 0.05420762886128585 %forward: 22.718343459043197 %backward: 60.95780415745769 [2025-04-09 02:37:28,496] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31458.23 | forward: 145429.28 | backward_microstep: 390227.05 | backward: 390215.48 | backward_inner_microstep: 390199.01 | backward_inner: 390192.59 | backward_allreduce_microstep: 7.80 | backward_allreduce: 2.69 | reduce_tied_grads: 0.31 | comms: 18.04 | reduce_grads: 0.19 | step: 347.00 | _step_clipping: 0.11 | _step_step: 345.27 | _step_zero_grad: 0.51 | _step_check_overflow: 0.51 samples/sec: 15.996 | iteration 20780/ 143000 | elapsed time per iteration (ms): 64014.6 | learning rate: 5.734E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.346858E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 02:48:07,700] [INFO] [logging.py:60:log_dist] [Rank 0] step=20790, skipped=22, lr=[0.0005733314088790063, 0.0005733314088790063], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20790 loss: 2.3464 iter time (s): 63.920 samples/sec: 16.020 %comms: 0.0028083223547378505 %optimizer_step 0.05698233801382484 %forward: 22.76724113170547 %backward: 61.053299868770694 [2025-04-09 02:48:07,701] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30386.81 | forward: 145528.02 | backward_microstep: 390264.15 | backward: 390252.19 | backward_inner_microstep: 390234.52 | backward_inner: 390227.52 | backward_allreduce_microstep: 8.32 | backward_allreduce: 2.90 | reduce_tied_grads: 0.31 | comms: 17.95 | reduce_grads: 0.20 | step: 364.23 | _step_clipping: 0.11 | _step_step: 362.44 | _step_zero_grad: 0.51 | _step_check_overflow: 0.58 samples/sec: 16.020 | iteration 20790/ 143000 | elapsed time per iteration (ms): 63920.5 | learning rate: 5.733E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.353660E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 02:58:42,566] [INFO] [logging.py:60:log_dist] [Rank 0] step=20800, skipped=22, lr=[0.0005733042368434272, 0.0005733042368434272], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20800 loss: 2.3529 iter time (s): 63.486 samples/sec: 16.130 %comms: 0.00285061254010031 %optimizer_step 0.05672852648850716 %forward: 22.93667691379493 %backward: 61.500166090823605 [2025-04-09 02:58:42,567] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25696.35 | forward: 145615.80 | backward_microstep: 390453.63 | backward: 390440.00 | backward_inner_microstep: 390422.65 | backward_inner: 390416.00 | backward_allreduce_microstep: 8.06 | backward_allreduce: 2.77 | reduce_tied_grads: 0.33 | comms: 18.10 | reduce_grads: 0.21 | step: 360.15 | _step_clipping: 0.13 | _step_step: 358.22 | _step_zero_grad: 0.53 | _step_check_overflow: 0.67 samples/sec: 16.129 | iteration 20800/ 143000 | elapsed time per iteration (ms): 63486.6 | learning rate: 5.733E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.352915E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 03:09:19,631] [INFO] [logging.py:60:log_dist] [Rank 0] step=20810, skipped=22, lr=[0.00057327705161696, 0.00057327705161696], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20810 loss: 2.3734 iter time (s): 63.706 samples/sec: 16.074 %comms: 0.0028156219673746564 %optimizer_step 0.05495084806672877 %forward: 22.855193959662216 %backward: 61.244650844081285 [2025-04-09 03:09:19,632] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28280.00 | forward: 145601.15 | backward_microstep: 390174.56 | backward: 390164.78 | backward_inner_microstep: 390145.89 | backward_inner: 390139.52 | backward_allreduce_microstep: 8.26 | backward_allreduce: 2.80 | reduce_tied_grads: 0.29 | comms: 17.94 | reduce_grads: 0.19 | step: 350.07 | _step_clipping: 0.14 | _step_step: 348.27 | _step_zero_grad: 0.51 | _step_check_overflow: 0.53 samples/sec: 16.074 | iteration 20810/ 143000 | elapsed time per iteration (ms): 63706.5 | learning rate: 5.733E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.356481E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 03:19:59,063] [INFO] [logging.py:60:log_dist] [Rank 0] step=20820, skipped=22, lr=[0.0005732498532009169, 0.0005732498532009169], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20820 loss: 2.3310 iter time (s): 63.943 samples/sec: 16.014 %comms: 0.002841556277266297 %optimizer_step 0.06102442426403268 %forward: 22.780819800652893 %backward: 61.049201498282166 [2025-04-09 03:19:59,064] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30321.65 | forward: 145666.42 | backward_microstep: 390377.35 | backward: 390364.29 | backward_inner_microstep: 390346.30 | backward_inner: 390339.35 | backward_allreduce_microstep: 8.53 | backward_allreduce: 2.92 | reduce_tied_grads: 0.34 | comms: 18.17 | reduce_grads: 0.22 | step: 390.21 | _step_clipping: 0.17 | _step_step: 388.20 | _step_zero_grad: 0.55 | _step_check_overflow: 0.65 samples/sec: 16.014 | iteration 20820/ 143000 | elapsed time per iteration (ms): 63943.2 | learning rate: 5.732E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.346884E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 03:30:32,969] [INFO] [logging.py:60:log_dist] [Rank 0] step=20830, skipped=22, lr=[0.0005732226415966107, 0.0005732226415966107], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20830 loss: 2.3590 iter time (s): 63.390 samples/sec: 16.154 %comms: 0.002895139134359567 %optimizer_step 0.05675894415059968 %forward: 22.987660484478017 %backward: 61.60087789419626 [2025-04-09 03:30:32,970] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24665.10 | forward: 145718.65 | backward_microstep: 390503.85 | backward: 390487.61 | backward_inner_microstep: 390469.72 | backward_inner: 390462.91 | backward_allreduce_microstep: 8.40 | backward_allreduce: 2.91 | reduce_tied_grads: 0.35 | comms: 18.35 | reduce_grads: 0.21 | step: 359.79 | _step_clipping: 0.12 | _step_step: 357.90 | _step_zero_grad: 0.55 | _step_check_overflow: 0.59 samples/sec: 16.154 | iteration 20830/ 143000 | elapsed time per iteration (ms): 63390.6 | learning rate: 5.732E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.353098E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 03:41:07,551] [INFO] [logging.py:60:log_dist] [Rank 0] step=20840, skipped=22, lr=[0.0005731954168053544, 0.0005731954168053544], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20840 loss: 2.3508 iter time (s): 63.458 samples/sec: 16.137 %comms: 0.00288036779065553 %optimizer_step 0.058582557033686825 %forward: 22.96963530431965 %backward: 61.539412032203074 [2025-04-09 03:41:07,552] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25214.20 | forward: 145759.79 | backward_microstep: 390533.64 | backward: 390514.32 | backward_inner_microstep: 390496.60 | backward_inner: 390486.37 | backward_allreduce_microstep: 8.34 | backward_allreduce: 2.87 | reduce_tied_grads: 0.30 | comms: 18.28 | reduce_grads: 0.21 | step: 371.75 | _step_clipping: 0.11 | _step_step: 369.84 | _step_zero_grad: 0.52 | _step_check_overflow: 0.68 samples/sec: 16.137 | iteration 20840/ 143000 | elapsed time per iteration (ms): 63458.2 | learning rate: 5.732E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.353464E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 03:51:44,119] [INFO] [logging.py:60:log_dist] [Rank 0] step=20850, skipped=22, lr=[0.0005731681788284624, 0.0005731681788284624], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20850 loss: 2.3530 iter time (s): 63.656 samples/sec: 16.086 %comms: 0.002852615901067189 %optimizer_step 0.05597180496211703 %forward: 22.920759663730852 %backward: 61.382018346093744 [2025-04-09 03:51:44,120] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26817.32 | forward: 145904.89 | backward_microstep: 390752.33 | backward: 390734.72 | backward_inner_microstep: 390714.29 | backward_inner: 390705.42 | backward_allreduce_microstep: 10.40 | backward_allreduce: 3.15 | reduce_tied_grads: 0.34 | comms: 18.16 | reduce_grads: 0.22 | step: 356.30 | _step_clipping: 0.12 | _step_step: 354.45 | _step_zero_grad: 0.55 | _step_check_overflow: 0.55 samples/sec: 16.086 | iteration 20850/ 143000 | elapsed time per iteration (ms): 63656.8 | learning rate: 5.732E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.352214E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 04:02:21,665] [INFO] [logging.py:60:log_dist] [Rank 0] step=20860, skipped=22, lr=[0.0005731409276672492, 0.0005731409276672492], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20860 loss: 2.3754 iter time (s): 63.754 samples/sec: 16.062 %comms: 0.002889196287985381 %optimizer_step 0.057715601940580154 %forward: 22.862888042494266 %backward: 61.28678123611534 [2025-04-09 04:02:21,665] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27928.94 | forward: 145759.74 | backward_microstep: 390745.32 | backward: 390726.91 | backward_inner_microstep: 390708.32 | backward_inner: 390700.92 | backward_allreduce_microstep: 8.61 | backward_allreduce: 3.10 | reduce_tied_grads: 0.35 | comms: 18.42 | reduce_grads: 0.23 | step: 367.96 | _step_clipping: 0.17 | _step_step: 365.98 | _step_zero_grad: 0.56 | _step_check_overflow: 0.58 samples/sec: 16.062 | iteration 20860/ 143000 | elapsed time per iteration (ms): 63754.5 | learning rate: 5.731E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.360985E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 04:13:01,002] [INFO] [logging.py:60:log_dist] [Rank 0] step=20870, skipped=22, lr=[0.00057311366332303, 0.00057311366332303], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20870 loss: 2.3587 iter time (s): 63.933 samples/sec: 16.017 %comms: 0.0028476835999157544 %optimizer_step 0.056455974382688526 %forward: 22.798330515600096 %backward: 61.068310810876625 [2025-04-09 04:13:01,002] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30072.27 | forward: 145756.76 | backward_microstep: 390442.15 | backward: 390428.55 | backward_inner_microstep: 390410.46 | backward_inner: 390403.49 | backward_allreduce_microstep: 8.60 | backward_allreduce: 2.95 | reduce_tied_grads: 0.32 | comms: 18.21 | reduce_grads: 0.21 | step: 360.94 | _step_clipping: 0.11 | _step_step: 359.08 | _step_zero_grad: 0.54 | _step_check_overflow: 0.58 samples/sec: 16.017 | iteration 20870/ 143000 | elapsed time per iteration (ms): 63933.7 | learning rate: 5.731E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.364830E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 04:23:44,370] [INFO] [logging.py:60:log_dist] [Rank 0] step=20880, skipped=22, lr=[0.0005730863857971207, 0.0005730863857971207], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20880 loss: 2.3330 iter time (s): 64.336 samples/sec: 15.916 %comms: 0.002808417386188514 %optimizer_step 0.054188810730235605 %forward: 22.657152254727634 %backward: 60.68277783675073 [2025-04-09 04:23:44,371] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 34102.27 | forward: 145767.70 | backward_microstep: 390424.17 | backward: 390410.44 | backward_inner_microstep: 390392.46 | backward_inner: 390385.57 | backward_allreduce_microstep: 8.55 | backward_allreduce: 2.95 | reduce_tied_grads: 0.31 | comms: 18.07 | reduce_grads: 0.19 | step: 348.63 | _step_clipping: 0.13 | _step_step: 346.83 | _step_zero_grad: 0.51 | _step_check_overflow: 0.56 samples/sec: 15.916 | iteration 20880/ 143000 | elapsed time per iteration (ms): 64336.9 | learning rate: 5.731E-04 | approx flops per GPU: 68.7TFLOPS | lm_loss: 2.344103E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 04:34:24,345] [INFO] [logging.py:60:log_dist] [Rank 0] step=20890, skipped=22, lr=[0.0005730590950908379, 0.0005730590950908379], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20890 loss: 2.3639 iter time (s): 63.997 samples/sec: 16.001 %comms: 0.0028400382557284593 %optimizer_step 0.05641987296646315 %forward: 22.738482916543454 %backward: 61.00006103588047 [2025-04-09 04:34:24,346] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31024.05 | forward: 145519.23 | backward_microstep: 390398.20 | backward: 390381.46 | backward_inner_microstep: 390363.84 | backward_inner: 390357.15 | backward_allreduce_microstep: 8.34 | backward_allreduce: 2.89 | reduce_tied_grads: 0.33 | comms: 18.18 | reduce_grads: 0.21 | step: 361.07 | _step_clipping: 0.12 | _step_step: 359.08 | _step_zero_grad: 0.58 | _step_check_overflow: 0.66 samples/sec: 16.001 | iteration 20890/ 143000 | elapsed time per iteration (ms): 63997.5 | learning rate: 5.731E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.352267E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 04:45:01,549] [INFO] [logging.py:60:log_dist] [Rank 0] step=20900, skipped=22, lr=[0.0005730317912054987, 0.0005730317912054987], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20900 loss: 2.3460 iter time (s): 63.720 samples/sec: 16.070 %comms: 0.002848350834781361 %optimizer_step 0.05756343984579359 %forward: 22.862334054112207 %backward: 61.273204337026165 [2025-04-09 04:45:01,549] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28047.54 | forward: 145678.17 | backward_microstep: 390444.87 | backward: 390431.20 | backward_inner_microstep: 390413.60 | backward_inner: 390406.66 | backward_allreduce_microstep: 8.28 | backward_allreduce: 2.84 | reduce_tied_grads: 0.36 | comms: 18.15 | reduce_grads: 0.21 | step: 366.79 | _step_clipping: 0.11 | _step_step: 364.89 | _step_zero_grad: 0.56 | _step_check_overflow: 0.59 samples/sec: 16.070 | iteration 20900/ 143000 | elapsed time per iteration (ms): 63720.4 | learning rate: 5.730E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.363563E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 04:55:32,316] [INFO] [logging.py:60:log_dist] [Rank 0] step=20910, skipped=22, lr=[0.000573004474142421, 0.000573004474142421], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20910 loss: 2.3718 iter time (s): 63.076 samples/sec: 16.234 %comms: 0.002914119285736194 %optimizer_step 0.0570279642565973 %forward: 23.05415391335507 %backward: 61.908367160274345 [2025-04-09 04:55:32,317] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21775.91 | forward: 145416.54 | backward_microstep: 390505.74 | backward: 390493.64 | backward_inner_microstep: 390475.93 | backward_inner: 390469.23 | backward_allreduce_microstep: 8.41 | backward_allreduce: 2.89 | reduce_tied_grads: 0.32 | comms: 18.38 | reduce_grads: 0.22 | step: 359.71 | _step_clipping: 0.13 | _step_step: 357.60 | _step_zero_grad: 0.61 | _step_check_overflow: 0.71 samples/sec: 16.234 | iteration 20910/ 143000 | elapsed time per iteration (ms): 63076.7 | learning rate: 5.730E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.363270E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 05:06:12,907] [INFO] [logging.py:60:log_dist] [Rank 0] step=20920, skipped=22, lr=[0.000572977143902923, 0.000572977143902923], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20920 loss: 2.3490 iter time (s): 64.058 samples/sec: 15.985 %comms: 0.0028204489177457376 %optimizer_step 0.05618566490141154 %forward: 22.74396493084335 %backward: 60.96966126395099 [2025-04-09 05:06:12,908] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31241.88 | forward: 145694.32 | backward_microstep: 390576.52 | backward: 390562.23 | backward_inner_microstep: 390544.15 | backward_inner: 390537.43 | backward_allreduce_microstep: 8.60 | backward_allreduce: 2.97 | reduce_tied_grads: 0.31 | comms: 18.07 | reduce_grads: 0.21 | step: 359.92 | _step_clipping: 0.12 | _step_step: 357.95 | _step_zero_grad: 0.57 | _step_check_overflow: 0.67 samples/sec: 15.985 | iteration 20920/ 143000 | elapsed time per iteration (ms): 64059.1 | learning rate: 5.730E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.365826E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 05:16:51,141] [INFO] [logging.py:60:log_dist] [Rank 0] step=20930, skipped=22, lr=[0.0005729498004883242, 0.0005729498004883242], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20930 loss: 2.3548 iter time (s): 63.823 samples/sec: 16.044 %comms: 0.0028252194838300653 %optimizer_step 0.05629810819741066 %forward: 22.80879575739918 %backward: 61.18457391953807 [2025-04-09 05:16:51,142] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29053.87 | forward: 145572.26 | backward_microstep: 390508.87 | backward: 390497.45 | backward_inner_microstep: 390480.99 | backward_inner: 390474.67 | backward_allreduce_microstep: 7.76 | backward_allreduce: 2.67 | reduce_tied_grads: 0.31 | comms: 18.03 | reduce_grads: 0.21 | step: 359.31 | _step_clipping: 0.12 | _step_step: 357.40 | _step_zero_grad: 0.49 | _step_check_overflow: 0.53 samples/sec: 16.044 | iteration 20930/ 143000 | elapsed time per iteration (ms): 63823.4 | learning rate: 5.729E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.353112E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 05:27:29,433] [INFO] [logging.py:60:log_dist] [Rank 0] step=20940, skipped=22, lr=[0.0005729224438999439, 0.0005729224438999439], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20940 loss: 2.3641 iter time (s): 63.829 samples/sec: 16.043 %comms: 0.0028189498265535112 %optimizer_step 0.05619454783435743 %forward: 22.845673745826698 %backward: 61.18495408352709 [2025-04-09 05:27:29,434] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28839.57 | forward: 145820.83 | backward_microstep: 390547.36 | backward: 390535.24 | backward_inner_microstep: 390516.04 | backward_inner: 390507.78 | backward_allreduce_microstep: 10.10 | backward_allreduce: 2.80 | reduce_tied_grads: 0.32 | comms: 17.99 | reduce_grads: 0.21 | step: 358.68 | _step_clipping: 0.13 | _step_step: 356.97 | _step_zero_grad: 0.46 | _step_check_overflow: 0.53 samples/sec: 16.043 | iteration 20940/ 143000 | elapsed time per iteration (ms): 63829.2 | learning rate: 5.729E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.366064E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 05:38:02,651] [INFO] [logging.py:60:log_dist] [Rank 0] step=20950, skipped=22, lr=[0.0005728950741391029, 0.0005728950741391029], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20950 loss: 2.3455 iter time (s): 63.321 samples/sec: 16.172 %comms: 0.0028672573172812627 %optimizer_step 0.05658393058767756 %forward: 22.982487754246005 %backward: 61.658957213134954 [2025-04-09 05:38:02,652] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24174.68 | forward: 145527.84 | backward_microstep: 390443.12 | backward: 390431.83 | backward_inner_microstep: 390414.65 | backward_inner: 390408.19 | backward_allreduce_microstep: 8.21 | backward_allreduce: 2.85 | reduce_tied_grads: 0.42 | comms: 18.16 | reduce_grads: 0.20 | step: 358.30 | _step_clipping: 0.12 | _step_step: 356.52 | _step_zero_grad: 0.52 | _step_check_overflow: 0.53 samples/sec: 16.171 | iteration 20950/ 143000 | elapsed time per iteration (ms): 63321.7 | learning rate: 5.729E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.354437E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 05:48:41,777] [INFO] [logging.py:60:log_dist] [Rank 0] step=20960, skipped=22, lr=[0.0005728676912071216, 0.0005728676912071216], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20960 loss: 2.3351 iter time (s): 63.912 samples/sec: 16.022 %comms: 0.002823442117856938 %optimizer_step 0.05589767792643722 %forward: 22.77955156147766 %backward: 61.08457581489469 [2025-04-09 05:48:41,778] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30042.47 | forward: 145588.70 | backward_microstep: 390416.73 | backward: 390403.82 | backward_inner_microstep: 390384.74 | backward_inner: 390374.01 | backward_allreduce_microstep: 8.14 | backward_allreduce: 2.80 | reduce_tied_grads: 0.33 | comms: 18.05 | reduce_grads: 0.20 | step: 357.25 | _step_clipping: 0.14 | _step_step: 355.37 | _step_zero_grad: 0.51 | _step_check_overflow: 0.63 samples/sec: 16.022 | iteration 20960/ 143000 | elapsed time per iteration (ms): 63912.6 | learning rate: 5.729E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.346231E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 05:59:13,268] [INFO] [logging.py:60:log_dist] [Rank 0] step=20970, skipped=22, lr=[0.0005728402951053221, 0.0005728402951053221], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20970 loss: 2.3625 iter time (s): 63.149 samples/sec: 16.216 %comms: 0.0028404754154831 %optimizer_step 0.056249258759055176 %forward: 23.034719209081224 %backward: 61.77214355531084 [2025-04-09 05:59:13,269] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22900.34 | forward: 145460.86 | backward_microstep: 390090.57 | backward: 390081.98 | backward_inner_microstep: 390065.51 | backward_inner: 390059.29 | backward_allreduce_microstep: 7.95 | backward_allreduce: 2.76 | reduce_tied_grads: 0.30 | comms: 17.94 | reduce_grads: 0.19 | step: 355.21 | _step_clipping: 0.12 | _step_step: 353.42 | _step_zero_grad: 0.49 | _step_check_overflow: 0.60 samples/sec: 16.216 | iteration 20970/ 143000 | elapsed time per iteration (ms): 63149.1 | learning rate: 5.728E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.358126E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 06:09:45,691] [INFO] [logging.py:60:log_dist] [Rank 0] step=20980, skipped=22, lr=[0.0005728128858350264, 0.0005728128858350264], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20980 loss: 2.3455 iter time (s): 63.242 samples/sec: 16.192 %comms: 0.0028932530453005707 %optimizer_step 0.05683262064412333 %forward: 22.967109808894445 %backward: 61.71004921069243 [2025-04-09 06:09:45,692] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23854.56 | forward: 145247.98 | backward_microstep: 390275.69 | backward: 390265.05 | backward_inner_microstep: 390248.37 | backward_inner: 390241.92 | backward_allreduce_microstep: 7.97 | backward_allreduce: 2.74 | reduce_tied_grads: 0.34 | comms: 18.30 | reduce_grads: 0.21 | step: 359.42 | _step_clipping: 0.13 | _step_step: 357.61 | _step_zero_grad: 0.50 | _step_check_overflow: 0.59 samples/sec: 16.192 | iteration 20980/ 143000 | elapsed time per iteration (ms): 63242.3 | learning rate: 5.728E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.353300E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 06:20:19,962] [INFO] [logging.py:60:log_dist] [Rank 0] step=20990, skipped=22, lr=[0.0005727854633975577, 0.0005727854633975577], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 20990 loss: 2.3479 iter time (s): 63.427 samples/sec: 16.145 %comms: 0.0028524945114120704 %optimizer_step 0.058201676277186226 %forward: 22.948725290420413 %backward: 61.53106434203186 [2025-04-09 06:20:19,963] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25352.78 | forward: 145555.89 | backward_microstep: 390279.38 | backward: 390270.43 | backward_inner_microstep: 390253.34 | backward_inner: 390246.84 | backward_allreduce_microstep: 8.28 | backward_allreduce: 2.98 | reduce_tied_grads: 0.35 | comms: 18.09 | reduce_grads: 0.23 | step: 369.15 | _step_clipping: 0.16 | _step_step: 367.16 | _step_zero_grad: 0.51 | _step_check_overflow: 0.70 samples/sec: 16.145 | iteration 20990/ 143000 | elapsed time per iteration (ms): 63427.2 | learning rate: 5.728E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.347311E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 06:31:02,252] [INFO] [logging.py:60:log_dist] [Rank 0] step=21000, skipped=22, lr=[0.0005727580277942393, 0.0005727580277942393], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21000 loss: 2.3611 iter time (s): 64.228 samples/sec: 15.943 %comms: 0.0028216395635861963 %optimizer_step 0.057235558919390965 %forward: 22.681108097074414 %backward: 60.78685287742134 [2025-04-09 06:31:02,253] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 33079.21 | forward: 145676.90 | backward_microstep: 390437.21 | backward: 390423.63 | backward_inner_microstep: 390405.48 | backward_inner: 390398.55 | backward_allreduce_microstep: 8.61 | backward_allreduce: 2.97 | reduce_tied_grads: 0.34 | comms: 18.12 | reduce_grads: 0.22 | step: 367.61 | _step_clipping: 0.12 | _step_step: 365.62 | _step_zero_grad: 0.60 | _step_check_overflow: 0.63 samples/sec: 15.943 | iteration 21000/ 143000 | elapsed time per iteration (ms): 64229.0 | learning rate: 5.728E-04 | approx flops per GPU: 68.8TFLOPS | lm_loss: 2.347638E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 06:31:05,079] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: ../checkpoints/mydata_left-pythia160m/global_step21000/mp_rank_00_model_states.pt [2025-04-09 06:31:19,189] [INFO] [engine.py:1805:_copy_recovery_script] creating recovery script ../checkpoints/mydata_left-pythia160m/zero_to_fp32.py [2025-04-09 06:31:19,195] [INFO] [engine.py:1818:_save_zero_checkpoint] zero checkpoint saved ../checkpoints/mydata_left-pythia160m/global_step21000/zero_pp_rank_0_mp_rank_00_optim_states.pt [2025-04-09 06:41:53,120] [INFO] [logging.py:60:log_dist] [Rank 0] step=21010, skipped=22, lr=[0.0005727305790263953, 0.0005727305790263953], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21010 loss: 2.3428 iter time (s): 63.391 samples/sec: 16.154 %comms: 0.00286101308088436 %optimizer_step 0.056858107587059274 %forward: 22.99139741557248 %backward: 61.61797202958237 [2025-04-09 06:41:53,121] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24410.32 | forward: 145744.89 | backward_microstep: 390621.52 | backward: 390602.81 | backward_inner_microstep: 390584.94 | backward_inner: 390578.07 | backward_allreduce_microstep: 8.20 | backward_allreduce: 2.83 | reduce_tied_grads: 0.34 | comms: 18.14 | reduce_grads: 0.21 | step: 360.43 | _step_clipping: 0.17 | _step_step: 358.39 | _step_zero_grad: 0.54 | _step_check_overflow: 0.70 samples/sec: 15.733 | iteration 21010/ 143000 | elapsed time per iteration (ms): 65086.8 | learning rate: 5.727E-04 | approx flops per GPU: 67.9TFLOPS | lm_loss: 2.342903E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 06:52:29,238] [INFO] [logging.py:60:log_dist] [Rank 0] step=21020, skipped=22, lr=[0.0005727031170953506, 0.0005727031170953506], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21020 loss: 2.3602 iter time (s): 63.611 samples/sec: 16.098 %comms: 0.0028413681507467956 %optimizer_step 0.05737263104191245 %forward: 22.890141168883957 %backward: 61.36494804300151 [2025-04-09 06:52:29,238] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27074.71 | forward: 145606.86 | backward_microstep: 390362.53 | backward: 390349.59 | backward_inner_microstep: 390329.43 | backward_inner: 390322.66 | backward_allreduce_microstep: 10.86 | backward_allreduce: 2.87 | reduce_tied_grads: 0.32 | comms: 18.07 | reduce_grads: 0.22 | step: 364.95 | _step_clipping: 0.13 | _step_step: 362.97 | _step_zero_grad: 0.70 | _step_check_overflow: 0.56 samples/sec: 16.098 | iteration 21020/ 143000 | elapsed time per iteration (ms): 63611.8 | learning rate: 5.727E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.351759E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 07:03:08,265] [INFO] [logging.py:60:log_dist] [Rank 0] step=21030, skipped=22, lr=[0.0005726756420024306, 0.0005726756420024306], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21030 loss: 2.3488 iter time (s): 63.902 samples/sec: 16.024 %comms: 0.002814512593858325 %optimizer_step 0.05415187000638337 %forward: 22.770047048255275 %backward: 61.06859067069941 [2025-04-09 07:03:08,266] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30208.03 | forward: 145505.52 | backward_microstep: 390252.70 | backward: 390241.50 | backward_inner_microstep: 390224.68 | backward_inner: 390218.23 | backward_allreduce_microstep: 7.98 | backward_allreduce: 2.79 | reduce_tied_grads: 0.31 | comms: 17.99 | reduce_grads: 0.19 | step: 346.04 | _step_clipping: 0.11 | _step_step: 344.33 | _step_zero_grad: 0.48 | _step_check_overflow: 0.55 samples/sec: 16.024 | iteration 21030/ 143000 | elapsed time per iteration (ms): 63902.7 | learning rate: 5.727E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.350377E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 07:13:42,226] [INFO] [logging.py:60:log_dist] [Rank 0] step=21040, skipped=22, lr=[0.0005726481537489615, 0.0005726481537489615], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21040 loss: 2.3460 iter time (s): 63.396 samples/sec: 16.153 %comms: 0.002838244090569824 %optimizer_step 0.05631020014004812 %forward: 23.566356967824255 %backward: 61.559475928986465 [2025-04-09 07:13:42,227] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21201.40 | forward: 149400.28 | backward_microstep: 390270.96 | backward: 390259.84 | backward_inner_microstep: 390242.03 | backward_inner: 390235.20 | backward_allreduce_microstep: 8.53 | backward_allreduce: 2.93 | reduce_tied_grads: 0.29 | comms: 17.99 | reduce_grads: 0.20 | step: 356.98 | _step_clipping: 0.11 | _step_step: 355.22 | _step_zero_grad: 0.51 | _step_check_overflow: 0.56 samples/sec: 16.152 | iteration 21040/ 143000 | elapsed time per iteration (ms): 63396.1 | learning rate: 5.726E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.354159E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 07:24:28,758] [INFO] [logging.py:60:log_dist] [Rank 0] step=21050, skipped=22, lr=[0.0005726206523362699, 0.0005726206523362699], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21050 loss: 2.3570 iter time (s): 64.653 samples/sec: 15.839 %comms: 0.0027889991139346494 %optimizer_step 0.055482017015694295 %forward: 22.54395304217323 %backward: 60.37535501371807 [2025-04-09 07:24:28,758] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 37322.43 | forward: 145752.46 | backward_microstep: 390354.07 | backward: 390342.21 | backward_inner_microstep: 390324.15 | backward_inner: 390317.10 | backward_allreduce_microstep: 8.80 | backward_allreduce: 3.21 | reduce_tied_grads: 0.32 | comms: 18.03 | reduce_grads: 0.21 | step: 358.71 | _step_clipping: 0.13 | _step_step: 357.00 | _step_zero_grad: 0.52 | _step_check_overflow: 0.46 samples/sec: 15.838 | iteration 21050/ 143000 | elapsed time per iteration (ms): 64653.1 | learning rate: 5.726E-04 | approx flops per GPU: 68.3TFLOPS | lm_loss: 2.348372E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 07:35:01,967] [INFO] [logging.py:60:log_dist] [Rank 0] step=21060, skipped=22, lr=[0.000572593137765683, 0.000572593137765683], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21060 loss: 2.3496 iter time (s): 63.320 samples/sec: 16.172 %comms: 0.002848993740475156 %optimizer_step 0.05637931378180765 %forward: 22.9700948497584 %backward: 61.63432673332792 [2025-04-09 07:35:01,968] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24408.18 | forward: 145447.56 | backward_microstep: 390284.74 | backward: 390271.01 | backward_inner_microstep: 390251.74 | backward_inner: 390243.04 | backward_allreduce_microstep: 8.52 | backward_allreduce: 2.95 | reduce_tied_grads: 0.32 | comms: 18.04 | reduce_grads: 0.21 | step: 357.00 | _step_clipping: 0.12 | _step_step: 355.20 | _step_zero_grad: 0.50 | _step_check_overflow: 0.59 samples/sec: 16.172 | iteration 21060/ 143000 | elapsed time per iteration (ms): 63321.0 | learning rate: 5.726E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.348392E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 07:45:45,402] [INFO] [logging.py:60:log_dist] [Rank 0] step=21070, skipped=22, lr=[0.0005725656100385291, 0.0005725656100385291], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21070 loss: 2.3533 iter time (s): 64.343 samples/sec: 15.915 %comms: 0.0028093149914446046 %optimizer_step 0.055524063921865895 %forward: 22.66768019836716 %backward: 60.66239345484566 [2025-04-09 07:45:45,403] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 34172.45 | forward: 145850.39 | backward_microstep: 390330.15 | backward: 390319.34 | backward_inner_microstep: 390298.84 | backward_inner: 390291.89 | backward_allreduce_microstep: 10.82 | backward_allreduce: 3.06 | reduce_tied_grads: 0.33 | comms: 18.08 | reduce_grads: 0.22 | step: 357.26 | _step_clipping: 0.13 | _step_step: 355.40 | _step_zero_grad: 0.54 | _step_check_overflow: 0.58 samples/sec: 15.915 | iteration 21070/ 143000 | elapsed time per iteration (ms): 64343.5 | learning rate: 5.726E-04 | approx flops per GPU: 68.7TFLOPS | lm_loss: 2.346949E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 07:56:16,774] [INFO] [logging.py:60:log_dist] [Rank 0] step=21080, skipped=22, lr=[0.0005725380691561366, 0.0005725380691561366], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21080 loss: 2.3503 iter time (s): 63.137 samples/sec: 16.219 %comms: 0.0029269253708214663 %optimizer_step 0.05728951624585662 %forward: 23.05397127995877 %backward: 61.832896940727444 [2025-04-09 07:56:16,774] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22312.39 | forward: 145554.73 | backward_microstep: 390404.19 | backward: 390391.34 | backward_inner_microstep: 390373.27 | backward_inner: 390366.24 | backward_allreduce_microstep: 8.64 | backward_allreduce: 3.01 | reduce_tied_grads: 0.32 | comms: 18.48 | reduce_grads: 0.21 | step: 361.71 | _step_clipping: 0.14 | _step_step: 359.73 | _step_zero_grad: 0.55 | _step_check_overflow: 0.64 samples/sec: 16.219 | iteration 21080/ 143000 | elapsed time per iteration (ms): 63137.1 | learning rate: 5.725E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.348798E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 08:07:01,266] [INFO] [logging.py:60:log_dist] [Rank 0] step=21090, skipped=22, lr=[0.0005725105151198347, 0.0005725105151198347], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21090 loss: 2.3578 iter time (s): 64.449 samples/sec: 15.889 %comms: 0.0029704006099340965 %optimizer_step 0.05605054761737053 %forward: 22.610719259654903 %backward: 60.604992566257565 [2025-04-09 08:07:01,267] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 35030.88 | forward: 145722.95 | backward_microstep: 390612.80 | backward: 390590.77 | backward_inner_microstep: 390572.51 | backward_inner: 390565.20 | backward_allreduce_microstep: 8.70 | backward_allreduce: 2.92 | reduce_tied_grads: 0.33 | comms: 19.14 | reduce_grads: 0.21 | step: 361.24 | _step_clipping: 0.13 | _step_step: 359.27 | _step_zero_grad: 0.57 | _step_check_overflow: 0.58 samples/sec: 15.888 | iteration 21090/ 143000 | elapsed time per iteration (ms): 64449.2 | learning rate: 5.725E-04 | approx flops per GPU: 68.5TFLOPS | lm_loss: 2.350427E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 08:17:37,524] [INFO] [logging.py:60:log_dist] [Rank 0] step=21100, skipped=22, lr=[0.0005724829479309534, 0.0005724829479309534], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21100 loss: 2.3816 iter time (s): 63.625 samples/sec: 16.094 %comms: 0.0034598991992625537 %optimizer_step 0.057092870945037774 %forward: 22.92737911816135 %backward: 61.41994618375837 [2025-04-09 08:17:37,525] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26375.25 | forward: 145875.82 | backward_microstep: 390803.76 | backward: 390785.39 | backward_inner_microstep: 390765.18 | backward_inner: 390757.99 | backward_allreduce_microstep: 10.45 | backward_allreduce: 3.12 | reduce_tied_grads: 0.34 | comms: 22.01 | reduce_grads: 0.22 | step: 363.25 | _step_clipping: 0.12 | _step_step: 361.27 | _step_zero_grad: 0.60 | _step_check_overflow: 0.60 samples/sec: 16.094 | iteration 21100/ 143000 | elapsed time per iteration (ms): 63625.8 | learning rate: 5.725E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.379192E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 08:28:13,682] [INFO] [logging.py:60:log_dist] [Rank 0] step=21110, skipped=22, lr=[0.0005724553675908233, 0.0005724553675908233], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21110 loss: 2.3494 iter time (s): 63.615 samples/sec: 16.097 %comms: 0.0028736458978106654 %optimizer_step 0.05835676785785099 %forward: 22.95987252224211 %backward: 61.40072060934977 [2025-04-09 08:28:13,682] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26270.41 | forward: 146059.60 | backward_microstep: 390620.87 | backward: 390601.67 | backward_inner_microstep: 390580.62 | backward_inner: 390573.46 | backward_allreduce_microstep: 10.88 | backward_allreduce: 4.92 | reduce_tied_grads: 0.35 | comms: 18.28 | reduce_grads: 0.24 | step: 371.24 | _step_clipping: 0.16 | _step_step: 369.39 | _step_zero_grad: 0.54 | _step_check_overflow: 0.52 samples/sec: 16.097 | iteration 21110/ 143000 | elapsed time per iteration (ms): 63615.8 | learning rate: 5.725E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.356374E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 08:38:50,304] [INFO] [logging.py:60:log_dist] [Rank 0] step=21120, skipped=22, lr=[0.0005724277741007754, 0.0005724277741007754], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21120 loss: 2.3350 iter time (s): 63.662 samples/sec: 16.085 %comms: 0.0028644675858466237 %optimizer_step 0.05683664554615986 %forward: 22.895359508918727 %backward: 61.34246946217963 [2025-04-09 08:38:50,305] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27118.81 | forward: 145755.72 | backward_microstep: 390531.46 | backward: 390516.50 | backward_inner_microstep: 390497.96 | backward_inner: 390490.90 | backward_allreduce_microstep: 8.78 | backward_allreduce: 3.03 | reduce_tied_grads: 0.33 | comms: 18.24 | reduce_grads: 0.20 | step: 361.83 | _step_clipping: 0.13 | _step_step: 359.98 | _step_zero_grad: 0.55 | _step_check_overflow: 0.54 samples/sec: 16.085 | iteration 21120/ 143000 | elapsed time per iteration (ms): 63662.3 | learning rate: 5.724E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.351131E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 08:49:45,361] [INFO] [logging.py:60:log_dist] [Rank 0] step=21130, skipped=22, lr=[0.0005724001674621413, 0.0005724001674621413], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21130 loss: 2.3463 iter time (s): 65.505 samples/sec: 15.632 %comms: 0.0027479708763757126 %optimizer_step 0.05460760446140766 %forward: 22.235064725602292 %backward: 59.57607276767518 [2025-04-09 08:49:45,361] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 45940.50 | forward: 145650.95 | backward_microstep: 390264.86 | backward: 390253.49 | backward_inner_microstep: 390235.75 | backward_inner: 390229.01 | backward_allreduce_microstep: 8.48 | backward_allreduce: 2.91 | reduce_tied_grads: 0.29 | comms: 18.00 | reduce_grads: 0.21 | step: 357.71 | _step_clipping: 0.14 | _step_step: 355.97 | _step_zero_grad: 0.52 | _step_check_overflow: 0.50 samples/sec: 15.632 | iteration 21130/ 143000 | elapsed time per iteration (ms): 65505.6 | learning rate: 5.724E-04 | approx flops per GPU: 67.4TFLOPS | lm_loss: 2.353684E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 09:00:21,497] [INFO] [logging.py:60:log_dist] [Rank 0] step=21140, skipped=22, lr=[0.0005723725476762539, 0.0005723725476762539], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21140 loss: 2.3598 iter time (s): 63.613 samples/sec: 16.097 %comms: 0.002865271668788651 %optimizer_step 0.05685744069755577 %forward: 22.90721013999603 %backward: 61.383726869964775 [2025-04-09 09:00:21,498] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26628.02 | forward: 145719.71 | backward_microstep: 390495.35 | backward: 390480.50 | backward_inner_microstep: 390460.83 | backward_inner: 390453.69 | backward_allreduce_microstep: 10.04 | backward_allreduce: 4.50 | reduce_tied_grads: 0.34 | comms: 18.23 | reduce_grads: 0.22 | step: 361.69 | _step_clipping: 0.14 | _step_step: 359.82 | _step_zero_grad: 0.55 | _step_check_overflow: 0.52 samples/sec: 16.097 | iteration 21140/ 143000 | elapsed time per iteration (ms): 63613.6 | learning rate: 5.724E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.348373E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 09:10:52,771] [INFO] [logging.py:60:log_dist] [Rank 0] step=21150, skipped=22, lr=[0.0005723449147444458, 0.0005723449147444458], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21150 loss: 2.3379 iter time (s): 63.127 samples/sec: 16.221 %comms: 0.002888929041376395 %optimizer_step 0.05884001892846581 %forward: 23.03311871485994 %backward: 61.82865894535359 [2025-04-09 09:10:52,772] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22285.40 | forward: 145400.64 | backward_microstep: 390316.96 | backward: 390304.35 | backward_inner_microstep: 390285.79 | backward_inner: 390278.76 | backward_allreduce_microstep: 8.63 | backward_allreduce: 2.96 | reduce_tied_grads: 0.35 | comms: 18.24 | reduce_grads: 0.22 | step: 371.44 | _step_clipping: 0.14 | _step_step: 369.45 | _step_zero_grad: 0.57 | _step_check_overflow: 0.63 samples/sec: 16.221 | iteration 21150/ 143000 | elapsed time per iteration (ms): 63127.4 | learning rate: 5.723E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.347153E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 09:21:30,433] [INFO] [logging.py:60:log_dist] [Rank 0] step=21160, skipped=22, lr=[0.0005723172686680509, 0.0005723172686680509], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21160 loss: 2.3588 iter time (s): 63.766 samples/sec: 16.059 %comms: 0.0032401307857041965 %optimizer_step 0.05695197825785423 %forward: 22.840503214169132 %backward: 61.192535112269255 [2025-04-09 09:21:30,433] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28525.68 | forward: 145643.76 | backward_microstep: 390208.96 | backward: 390197.66 | backward_inner_microstep: 390179.51 | backward_inner: 390172.62 | backward_allreduce_microstep: 8.76 | backward_allreduce: 3.05 | reduce_tied_grads: 0.33 | comms: 20.66 | reduce_grads: 0.22 | step: 363.16 | _step_clipping: 0.13 | _step_step: 361.21 | _step_zero_grad: 0.55 | _step_check_overflow: 0.66 samples/sec: 16.059 | iteration 21160/ 143000 | elapsed time per iteration (ms): 63766.2 | learning rate: 5.723E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.362736E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 09:32:05,767] [INFO] [logging.py:60:log_dist] [Rank 0] step=21170, skipped=22, lr=[0.0005722896094484037, 0.0005722896094484037], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21170 loss: 2.3635 iter time (s): 63.533 samples/sec: 16.118 %comms: 0.0028563928028703376 %optimizer_step 0.058730101933587214 %forward: 22.892657575802662 %backward: 61.42252906083028 [2025-04-09 09:32:05,768] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26328.47 | forward: 145443.51 | backward_microstep: 390246.96 | backward: 390234.64 | backward_inner_microstep: 390216.54 | backward_inner: 390209.52 | backward_allreduce_microstep: 8.75 | backward_allreduce: 2.92 | reduce_tied_grads: 0.34 | comms: 18.15 | reduce_grads: 0.22 | step: 373.13 | _step_clipping: 0.14 | _step_step: 371.16 | _step_zero_grad: 0.54 | _step_check_overflow: 0.68 samples/sec: 16.117 | iteration 21170/ 143000 | elapsed time per iteration (ms): 63533.4 | learning rate: 5.723E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.357503E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 09:42:39,907] [INFO] [logging.py:60:log_dist] [Rank 0] step=21180, skipped=22, lr=[0.0005722619370868388, 0.0005722619370868388], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21180 loss: 2.3760 iter time (s): 63.413 samples/sec: 16.148 %comms: 0.002990546336774722 %optimizer_step 0.0599708666924555 %forward: 22.99263163949802 %backward: 61.618467781220396 [2025-04-09 09:42:39,908] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24126.79 | forward: 145803.95 | backward_microstep: 390762.35 | backward: 390743.26 | backward_inner_microstep: 390724.73 | backward_inner: 390717.13 | backward_allreduce_microstep: 8.67 | backward_allreduce: 2.90 | reduce_tied_grads: 0.37 | comms: 18.96 | reduce_grads: 0.22 | step: 380.30 | _step_clipping: 0.14 | _step_step: 377.93 | _step_zero_grad: 0.66 | _step_check_overflow: 0.84 samples/sec: 16.148 | iteration 21180/ 143000 | elapsed time per iteration (ms): 63414.1 | learning rate: 5.723E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.354890E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 09:53:15,429] [INFO] [logging.py:60:log_dist] [Rank 0] step=21190, skipped=22, lr=[0.000572234251584692, 0.000572234251584692], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21190 loss: 2.3377 iter time (s): 63.552 samples/sec: 16.113 %comms: 0.0029126521057933935 %optimizer_step 0.056534255487838754 %forward: 22.934754204949208 %backward: 61.45687595957341 [2025-04-09 09:53:15,429] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25754.18 | forward: 145753.81 | backward_microstep: 390584.12 | backward: 390567.68 | backward_inner_microstep: 390547.59 | backward_inner: 390540.33 | backward_allreduce_microstep: 8.49 | backward_allreduce: 2.95 | reduce_tied_grads: 0.33 | comms: 18.51 | reduce_grads: 0.21 | step: 359.28 | _step_clipping: 0.12 | _step_step: 357.51 | _step_zero_grad: 0.53 | _step_check_overflow: 0.50 samples/sec: 16.113 | iteration 21190/ 143000 | elapsed time per iteration (ms): 63552.1 | learning rate: 5.722E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.350922E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 09:54:21,295] [INFO] [stage1.py:697:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 524288.0, reducing to 524288.0 [2025-04-09 09:55:24,100] [INFO] [stage1.py:697:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 524288.0, reducing to 262144.0 [2025-04-09 10:04:01,993] [INFO] [logging.py:60:log_dist] [Rank 0] step=21200, skipped=24, lr=[0.0005722120937226534, 0.0005722120937226534], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21200 loss: 2.3653 iter time (s): 64.656 samples/sec: 15.838 %comms: 0.002264348817375298 %optimizer_step 0.04851811501105176 %forward: 22.57809216773035 %backward: 60.409779020367694 [2025-04-09 10:04:01,994] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 36620.88 | forward: 145980.49 | backward_microstep: 390600.93 | backward: 390584.33 | backward_inner_microstep: 390565.48 | backward_inner: 390556.26 | backward_allreduce_microstep: 8.85 | backward_allreduce: 3.06 | reduce_tied_grads: 0.32 | comms: 14.64 | reduce_grads: 0.20 | step: 313.70 | _step_clipping: 0.12 | _step_step: 311.87 | _step_zero_grad: 0.55 | _step_check_overflow: 0.51 samples/sec: 15.838 | iteration 21200/ 143000 | elapsed time per iteration (ms): 64656.4 | learning rate: 5.722E-04 | approx flops per GPU: 68.3TFLOPS | lm_loss: 2.349087E+00 | loss scale: 262144.0 | number of skipped iterations: 2 | number of nan iterations: 0 | time (ms) [2025-04-09 10:14:34,900] [INFO] [logging.py:60:log_dist] [Rank 0] step=21210, skipped=24, lr=[0.0005721843845708269, 0.0005721843845708269], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21210 loss: 2.3719 iter time (s): 63.290 samples/sec: 16.179 %comms: 0.002881662042009575 %optimizer_step 0.058031044008509065 %forward: 23.022242524883772 %backward: 61.709563093876454 [2025-04-09 10:14:34,901] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23254.00 | forward: 145708.00 | backward_microstep: 390579.34 | backward: 390560.44 | backward_inner_microstep: 390542.29 | backward_inner: 390534.98 | backward_allreduce_microstep: 8.48 | backward_allreduce: 2.95 | reduce_tied_grads: 0.33 | comms: 18.24 | reduce_grads: 0.20 | step: 367.28 | _step_clipping: 0.12 | _step_step: 365.31 | _step_zero_grad: 0.53 | _step_check_overflow: 0.70 samples/sec: 16.179 | iteration 21210/ 143000 | elapsed time per iteration (ms): 63290.7 | learning rate: 5.722E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.356463E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 10:25:03,886] [INFO] [logging.py:60:log_dist] [Rank 0] step=21220, skipped=24, lr=[0.0005721566622821614, 0.0005721566622821614], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21220 loss: 2.3538 iter time (s): 62.898 samples/sec: 16.280 %comms: 0.00293988487079854 %optimizer_step 0.057982119875548126 %forward: 23.165221151819747 %backward: 62.11946547743948 [2025-04-09 10:25:03,887] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19183.88 | forward: 145704.45 | backward_microstep: 390736.61 | backward: 390718.60 | backward_inner_microstep: 390700.37 | backward_inner: 390692.95 | backward_allreduce_microstep: 8.41 | backward_allreduce: 2.91 | reduce_tied_grads: 0.33 | comms: 18.49 | reduce_grads: 0.20 | step: 364.70 | _step_clipping: 0.14 | _step_step: 362.64 | _step_zero_grad: 0.57 | _step_check_overflow: 0.68 samples/sec: 16.280 | iteration 21220/ 143000 | elapsed time per iteration (ms): 62898.6 | learning rate: 5.722E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.351902E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 10:35:37,390] [INFO] [logging.py:60:log_dist] [Rank 0] step=21230, skipped=24, lr=[0.000572128926857995, 0.000572128926857995], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21230 loss: 2.3687 iter time (s): 63.350 samples/sec: 16.164 %comms: 0.0028855322259790633 %optimizer_step 0.05730659247932992 %forward: 23.08261747547648 %backward: 61.70974842127627 [2025-04-09 10:35:37,391] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22953.77 | forward: 146227.94 | backward_microstep: 390951.92 | backward: 390930.06 | backward_inner_microstep: 390911.08 | backward_inner: 390903.53 | backward_allreduce_microstep: 8.59 | backward_allreduce: 2.96 | reduce_tied_grads: 0.32 | comms: 18.28 | reduce_grads: 0.20 | step: 363.04 | _step_clipping: 0.10 | _step_step: 361.18 | _step_zero_grad: 0.52 | _step_check_overflow: 0.63 samples/sec: 16.164 | iteration 21230/ 143000 | elapsed time per iteration (ms): 63350.4 | learning rate: 5.721E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.366373E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 10:46:13,054] [INFO] [logging.py:60:log_dist] [Rank 0] step=21240, skipped=24, lr=[0.0005721011782996663, 0.0005721011782996663], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21240 loss: 2.3455 iter time (s): 63.566 samples/sec: 16.109 %comms: 0.0029270006382146395 %optimizer_step 0.0574639636432269 %forward: 22.93892969424095 %backward: 61.45201862856209 [2025-04-09 10:46:13,054] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25850.85 | forward: 145813.11 | backward_microstep: 390643.72 | backward: 390624.58 | backward_inner_microstep: 390605.59 | backward_inner: 390598.06 | backward_allreduce_microstep: 8.93 | backward_allreduce: 3.04 | reduce_tied_grads: 0.38 | comms: 18.61 | reduce_grads: 0.35 | step: 365.27 | _step_clipping: 0.13 | _step_step: 363.19 | _step_zero_grad: 0.55 | _step_check_overflow: 0.76 samples/sec: 16.109 | iteration 21240/ 143000 | elapsed time per iteration (ms): 63566.4 | learning rate: 5.721E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.343296E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 10:56:45,218] [INFO] [logging.py:60:log_dist] [Rank 0] step=21250, skipped=24, lr=[0.0005720734166085145, 0.0005720734166085145], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21250 loss: 2.3690 iter time (s): 63.216 samples/sec: 16.198 %comms: 0.0028667550762401448 %optimizer_step 0.05538350099099696 %forward: 23.05743195478339 %backward: 61.76068615634719 [2025-04-09 10:56:45,219] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22666.15 | forward: 145759.51 | backward_microstep: 390439.60 | backward: 390425.40 | backward_inner_microstep: 390408.08 | backward_inner: 390401.26 | backward_allreduce_microstep: 8.16 | backward_allreduce: 2.79 | reduce_tied_grads: 0.31 | comms: 18.12 | reduce_grads: 0.22 | step: 350.11 | _step_clipping: 0.13 | _step_step: 348.27 | _step_zero_grad: 0.47 | _step_check_overflow: 0.65 samples/sec: 16.198 | iteration 21250/ 143000 | elapsed time per iteration (ms): 63216.4 | learning rate: 5.721E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.360642E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 11:07:24,080] [INFO] [logging.py:60:log_dist] [Rank 0] step=21260, skipped=24, lr=[0.0005720456417858797, 0.0005720456417858797], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21260 loss: 2.3489 iter time (s): 63.886 samples/sec: 16.029 %comms: 0.00281707002786612 %optimizer_step 0.055668043009295794 %forward: 22.768651488883254 %backward: 61.073541510965725 [2025-04-09 11:07:24,081] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29945.70 | forward: 145458.94 | backward_microstep: 390183.32 | backward: 390172.10 | backward_inner_microstep: 390155.01 | backward_inner: 390148.50 | backward_allreduce_microstep: 8.20 | backward_allreduce: 2.80 | reduce_tied_grads: 0.31 | comms: 18.00 | reduce_grads: 0.21 | step: 355.64 | _step_clipping: 0.12 | _step_step: 353.69 | _step_zero_grad: 0.51 | _step_check_overflow: 0.73 samples/sec: 16.028 | iteration 21260/ 143000 | elapsed time per iteration (ms): 63886.2 | learning rate: 5.720E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.358856E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 11:18:08,880] [INFO] [logging.py:60:log_dist] [Rank 0] step=21270, skipped=24, lr=[0.0005720178538331024, 0.0005720178538331024], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21270 loss: 2.3637 iter time (s): 64.479 samples/sec: 15.881 %comms: 0.0028316945002172873 %optimizer_step 0.05707468066190353 %forward: 22.609498751810715 %backward: 60.52120502790147 [2025-04-09 11:18:08,881] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 35502.38 | forward: 145784.49 | backward_microstep: 390247.96 | backward: 390236.57 | backward_inner_microstep: 390217.80 | backward_inner: 390210.66 | backward_allreduce_microstep: 9.09 | backward_allreduce: 3.17 | reduce_tied_grads: 0.36 | comms: 18.26 | reduce_grads: 0.24 | step: 368.01 | _step_clipping: 0.14 | _step_step: 365.95 | _step_zero_grad: 0.62 | _step_check_overflow: 0.63 samples/sec: 15.881 | iteration 21270/ 143000 | elapsed time per iteration (ms): 64480.0 | learning rate: 5.720E-04 | approx flops per GPU: 68.5TFLOPS | lm_loss: 2.356567E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 11:28:50,353] [INFO] [logging.py:60:log_dist] [Rank 0] step=21280, skipped=24, lr=[0.0005719900527515237, 0.0005719900527515237], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21280 loss: 2.3592 iter time (s): 64.147 samples/sec: 15.963 %comms: 0.0028549620936592166 %optimizer_step 0.057299501409469024 %forward: 22.75593470445941 %backward: 60.8683334444542 [2025-04-09 11:28:50,354] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31736.77 | forward: 145971.86 | backward_microstep: 390465.08 | backward: 390450.41 | backward_inner_microstep: 390430.92 | backward_inner: 390423.73 | backward_allreduce_microstep: 9.38 | backward_allreduce: 3.24 | reduce_tied_grads: 0.34 | comms: 18.31 | reduce_grads: 0.24 | step: 367.56 | _step_clipping: 0.14 | _step_step: 365.60 | _step_zero_grad: 0.55 | _step_check_overflow: 0.59 samples/sec: 15.963 | iteration 21280/ 143000 | elapsed time per iteration (ms): 64147.3 | learning rate: 5.720E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.355040E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 11:39:31,942] [INFO] [logging.py:60:log_dist] [Rank 0] step=21290, skipped=24, lr=[0.0005719622385424853, 0.0005719622385424853], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21290 loss: 2.3637 iter time (s): 64.158 samples/sec: 15.961 %comms: 0.002879499737572991 %optimizer_step 0.056238446679453824 %forward: 22.736874410984054 %backward: 60.86687745584312 [2025-04-09 11:39:31,943] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31930.90 | forward: 145875.60 | backward_microstep: 390526.54 | backward: 390510.69 | backward_inner_microstep: 390489.10 | backward_inner: 390481.84 | backward_allreduce_microstep: 9.42 | backward_allreduce: 3.24 | reduce_tied_grads: 0.36 | comms: 18.47 | reduce_grads: 0.24 | step: 360.82 | _step_clipping: 0.15 | _step_step: 358.80 | _step_zero_grad: 0.60 | _step_check_overflow: 0.58 samples/sec: 15.960 | iteration 21290/ 143000 | elapsed time per iteration (ms): 64158.9 | learning rate: 5.720E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.359181E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 11:50:08,998] [INFO] [logging.py:60:log_dist] [Rank 0] step=21300, skipped=24, lr=[0.0005719344112073299, 0.0005719344112073299], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21300 loss: 2.3474 iter time (s): 63.705 samples/sec: 16.074 %comms: 0.0028607248930096357 %optimizer_step 0.05895549136208857 %forward: 22.85977279011215 %backward: 61.24240139130826 [2025-04-09 11:50:08,999] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28022.22 | forward: 145628.11 | backward_microstep: 390155.01 | backward: 390144.53 | backward_inner_microstep: 390125.93 | backward_inner: 390118.95 | backward_allreduce_microstep: 8.95 | backward_allreduce: 3.08 | reduce_tied_grads: 0.35 | comms: 18.22 | reduce_grads: 0.28 | step: 375.58 | _step_clipping: 0.14 | _step_step: 373.56 | _step_zero_grad: 0.60 | _step_check_overflow: 0.61 samples/sec: 16.074 | iteration 21300/ 143000 | elapsed time per iteration (ms): 63705.6 | learning rate: 5.719E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.352151E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 12:00:42,500] [INFO] [logging.py:60:log_dist] [Rank 0] step=21310, skipped=24, lr=[0.0005719065707474003, 0.0005719065707474003], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21310 loss: 2.3380 iter time (s): 63.349 samples/sec: 16.164 %comms: 0.002907338960284848 %optimizer_step 0.05604261850756665 %forward: 23.020299838868727 %backward: 61.649700458297076 [2025-04-09 12:00:42,501] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23847.52 | forward: 145832.36 | backward_microstep: 390561.73 | backward: 390547.52 | backward_inner_microstep: 390529.21 | backward_inner: 390521.88 | backward_allreduce_microstep: 8.73 | backward_allreduce: 2.98 | reduce_tied_grads: 0.36 | comms: 18.42 | reduce_grads: 0.22 | step: 355.03 | _step_clipping: 0.13 | _step_step: 353.09 | _step_zero_grad: 0.56 | _step_check_overflow: 0.60 samples/sec: 16.164 | iteration 21310/ 143000 | elapsed time per iteration (ms): 63350.2 | learning rate: 5.719E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.344072E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 12:11:23,770] [INFO] [logging.py:60:log_dist] [Rank 0] step=21320, skipped=24, lr=[0.0005718787171640403, 0.0005718787171640403], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21320 loss: 2.3672 iter time (s): 64.126 samples/sec: 15.968 %comms: 0.0028266125521968738 %optimizer_step 0.05617621966257008 %forward: 22.7839053833844 %backward: 60.935048020223334 [2025-04-09 12:11:23,771] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31129.85 | forward: 146104.68 | backward_microstep: 390772.18 | backward: 390753.71 | backward_inner_microstep: 390734.20 | backward_inner: 390725.24 | backward_allreduce_microstep: 9.23 | backward_allreduce: 3.31 | reduce_tied_grads: 0.34 | comms: 18.13 | reduce_grads: 0.22 | step: 360.24 | _step_clipping: 0.13 | _step_step: 358.36 | _step_zero_grad: 0.52 | _step_check_overflow: 0.60 samples/sec: 15.968 | iteration 21320/ 143000 | elapsed time per iteration (ms): 64127.0 | learning rate: 5.719E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.351024E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 12:22:07,511] [INFO] [logging.py:60:log_dist] [Rank 0] step=21330, skipped=24, lr=[0.0005718508504585944, 0.0005718508504585944], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21330 loss: 2.3405 iter time (s): 64.373 samples/sec: 15.907 %comms: 0.0028636476064403203 %optimizer_step 0.05712354571583702 %forward: 22.67806101946867 %backward: 60.66674642885182 [2025-04-09 12:22:07,512] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 33925.30 | forward: 145986.49 | backward_microstep: 390549.15 | backward: 390532.75 | backward_inner_microstep: 390513.11 | backward_inner: 390506.28 | backward_allreduce_microstep: 8.38 | backward_allreduce: 2.89 | reduce_tied_grads: 0.36 | comms: 18.43 | reduce_grads: 0.24 | step: 367.72 | _step_clipping: 0.13 | _step_step: 365.57 | _step_zero_grad: 0.57 | _step_check_overflow: 0.80 samples/sec: 15.907 | iteration 21330/ 143000 | elapsed time per iteration (ms): 64374.1 | learning rate: 5.719E-04 | approx flops per GPU: 68.6TFLOPS | lm_loss: 2.357236E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 12:32:51,113] [INFO] [logging.py:60:log_dist] [Rank 0] step=21340, skipped=24, lr=[0.0005718229706324075, 0.0005718229706324075], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21340 loss: 2.3462 iter time (s): 64.360 samples/sec: 15.911 %comms: 0.0028218478538712946 %optimizer_step 0.055541140594080975 %forward: 22.641292926188473 %backward: 60.62005012563323 [2025-04-09 12:32:51,113] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 34519.51 | forward: 145718.43 | backward_microstep: 390160.50 | backward: 390148.15 | backward_inner_microstep: 390130.39 | backward_inner: 390123.73 | backward_allreduce_microstep: 8.60 | backward_allreduce: 2.95 | reduce_tied_grads: 0.33 | comms: 18.16 | reduce_grads: 0.23 | step: 357.46 | _step_clipping: 0.14 | _step_step: 355.62 | _step_zero_grad: 0.53 | _step_check_overflow: 0.55 samples/sec: 15.910 | iteration 21340/ 143000 | elapsed time per iteration (ms): 64360.2 | learning rate: 5.718E-04 | approx flops per GPU: 68.6TFLOPS | lm_loss: 2.344156E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 12:43:30,452] [INFO] [logging.py:60:log_dist] [Rank 0] step=21350, skipped=24, lr=[0.0005717950776868249, 0.0005717950776868249], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21350 loss: 2.3630 iter time (s): 63.933 samples/sec: 16.017 %comms: 0.0028285046408161843 %optimizer_step 0.056682378275050314 %forward: 22.822474445535693 %backward: 61.05062630028267 [2025-04-09 12:43:30,453] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29839.32 | forward: 145911.68 | backward_microstep: 390331.01 | backward: 390316.99 | backward_inner_microstep: 390298.49 | backward_inner: 390291.40 | backward_allreduce_microstep: 8.75 | backward_allreduce: 3.04 | reduce_tied_grads: 0.36 | comms: 18.08 | reduce_grads: 0.21 | step: 362.39 | _step_clipping: 0.13 | _step_step: 360.52 | _step_zero_grad: 0.53 | _step_check_overflow: 0.60 samples/sec: 16.017 | iteration 21350/ 143000 | elapsed time per iteration (ms): 63933.9 | learning rate: 5.718E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.355481E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 12:54:20,948] [INFO] [logging.py:60:log_dist] [Rank 0] step=21360, skipped=24, lr=[0.0005717671716231933, 0.0005717671716231933], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21360 loss: 2.3507 iter time (s): 65.049 samples/sec: 15.742 %comms: 0.0028284129179409374 %optimizer_step 0.05508644693377198 %forward: 22.42395321335519 %backward: 60.01748766191226 [2025-04-09 12:54:20,948] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 40931.64 | forward: 145865.42 | backward_microstep: 390425.44 | backward: 390407.33 | backward_inner_microstep: 390388.42 | backward_inner: 390381.45 | backward_allreduce_microstep: 8.96 | backward_allreduce: 3.02 | reduce_tied_grads: 0.35 | comms: 18.40 | reduce_grads: 0.21 | step: 358.33 | _step_clipping: 0.12 | _step_step: 356.38 | _step_zero_grad: 0.55 | _step_check_overflow: 0.62 samples/sec: 15.742 | iteration 21360/ 143000 | elapsed time per iteration (ms): 65049.6 | learning rate: 5.718E-04 | approx flops per GPU: 67.9TFLOPS | lm_loss: 2.352899E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 13:04:58,751] [INFO] [logging.py:60:log_dist] [Rank 0] step=21370, skipped=24, lr=[0.0005717392524428593, 0.0005717392524428593], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21370 loss: 2.3532 iter time (s): 63.780 samples/sec: 16.055 %comms: 0.002882829182205411 %optimizer_step 0.05800258744227071 %forward: 22.86148316270811 %backward: 61.21924767557566 [2025-04-09 13:04:58,752] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28256.12 | forward: 145809.89 | backward_microstep: 390470.87 | backward: 390454.62 | backward_inner_microstep: 390434.65 | backward_inner: 390427.11 | backward_allreduce_microstep: 9.54 | backward_allreduce: 3.38 | reduce_tied_grads: 0.34 | comms: 18.39 | reduce_grads: 0.23 | step: 369.94 | _step_clipping: 0.14 | _step_step: 367.89 | _step_zero_grad: 0.63 | _step_check_overflow: 0.61 samples/sec: 16.055 | iteration 21370/ 143000 | elapsed time per iteration (ms): 63780.3 | learning rate: 5.717E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.354069E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 13:15:44,392] [INFO] [logging.py:60:log_dist] [Rank 0] step=21380, skipped=24, lr=[0.0005717113201471704, 0.0005717113201471704], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21380 loss: 2.3499 iter time (s): 64.564 samples/sec: 15.860 %comms: 0.002801153782447528 %optimizer_step 0.05573803041730186 %forward: 22.585305236105423 %backward: 60.432668956967994 [2025-04-09 13:15:44,393] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 36388.99 | forward: 145818.73 | backward_microstep: 390185.42 | backward: 390174.72 | backward_inner_microstep: 390156.66 | backward_inner: 390149.91 | backward_allreduce_microstep: 8.69 | backward_allreduce: 2.99 | reduce_tied_grads: 0.38 | comms: 18.09 | reduce_grads: 0.25 | step: 359.86 | _step_clipping: 0.14 | _step_step: 357.97 | _step_zero_grad: 0.57 | _step_check_overflow: 0.56 samples/sec: 15.860 | iteration 21380/ 143000 | elapsed time per iteration (ms): 64564.1 | learning rate: 5.717E-04 | approx flops per GPU: 68.4TFLOPS | lm_loss: 2.349316E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 13:26:28,570] [INFO] [logging.py:60:log_dist] [Rank 0] step=21390, skipped=24, lr=[0.0005716833747374748, 0.0005716833747374748], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21390 loss: 2.3371 iter time (s): 64.417 samples/sec: 15.896 %comms: 0.002828025658550331 %optimizer_step 0.0558967456546088 %forward: 22.659755026586414 %backward: 60.63805800986254 [2025-04-09 13:26:28,571] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 34286.10 | forward: 145967.60 | backward_microstep: 390628.80 | backward: 390612.87 | backward_inner_microstep: 390594.36 | backward_inner: 390587.23 | backward_allreduce_microstep: 8.63 | backward_allreduce: 2.97 | reduce_tied_grads: 0.33 | comms: 18.22 | reduce_grads: 0.19 | step: 360.07 | _step_clipping: 0.13 | _step_step: 358.27 | _step_zero_grad: 0.54 | _step_check_overflow: 0.50 samples/sec: 15.896 | iteration 21390/ 143000 | elapsed time per iteration (ms): 64417.7 | learning rate: 5.717E-04 | approx flops per GPU: 68.6TFLOPS | lm_loss: 2.344072E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 13:37:08,925] [INFO] [logging.py:60:log_dist] [Rank 0] step=21400, skipped=24, lr=[0.0005716554162151213, 0.0005716554162151213], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21400 loss: 2.3618 iter time (s): 64.035 samples/sec: 15.991 %comms: 0.002921051129201913 %optimizer_step 0.05840929429605169 %forward: 22.818033913334357 %backward: 60.98199758653875 [2025-04-09 13:37:08,925] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30449.09 | forward: 146114.81 | backward_microstep: 390512.55 | backward: 390496.98 | backward_inner_microstep: 390477.92 | backward_inner: 390470.60 | backward_allreduce_microstep: 9.01 | backward_allreduce: 3.12 | reduce_tied_grads: 0.41 | comms: 18.70 | reduce_grads: 0.25 | step: 374.02 | _step_clipping: 0.16 | _step_step: 371.82 | _step_zero_grad: 0.63 | _step_check_overflow: 0.69 samples/sec: 15.991 | iteration 21400/ 143000 | elapsed time per iteration (ms): 64035.5 | learning rate: 5.717E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.355559E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 13:47:50,330] [INFO] [logging.py:60:log_dist] [Rank 0] step=21410, skipped=24, lr=[0.0005716274445814593, 0.0005716274445814593], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21410 loss: 2.3584 iter time (s): 64.140 samples/sec: 15.965 %comms: 0.0029574906578498654 %optimizer_step 0.0577718960196868 %forward: 22.77275824476537 %backward: 60.92722805653995 [2025-04-09 13:47:50,331] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31176.57 | forward: 146064.10 | backward_microstep: 390806.51 | backward: 390786.26 | backward_inner_microstep: 390767.44 | backward_inner: 390759.90 | backward_allreduce_microstep: 8.69 | backward_allreduce: 3.02 | reduce_tied_grads: 0.38 | comms: 18.97 | reduce_grads: 0.23 | step: 370.55 | _step_clipping: 0.13 | _step_step: 368.38 | _step_zero_grad: 0.57 | _step_check_overflow: 0.74 samples/sec: 15.965 | iteration 21410/ 143000 | elapsed time per iteration (ms): 64140.6 | learning rate: 5.716E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.360416E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 13:58:41,193] [INFO] [logging.py:60:log_dist] [Rank 0] step=21420, skipped=24, lr=[0.0005715994598378387, 0.0005715994598378387], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21420 loss: 2.3361 iter time (s): 65.086 samples/sec: 15.733 %comms: 0.0028286148307277896 %optimizer_step 0.05619865483060098 %forward: 22.49426360498557 %backward: 60.022014534332044 [2025-04-09 13:58:41,193] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 40441.63 | forward: 146405.24 | backward_microstep: 390674.96 | backward: 390656.81 | backward_inner_microstep: 390636.62 | backward_inner: 390628.95 | backward_allreduce_microstep: 9.57 | backward_allreduce: 3.35 | reduce_tied_grads: 0.34 | comms: 18.41 | reduce_grads: 0.22 | step: 365.77 | _step_clipping: 0.14 | _step_step: 363.75 | _step_zero_grad: 0.63 | _step_check_overflow: 0.59 samples/sec: 15.733 | iteration 21420/ 143000 | elapsed time per iteration (ms): 65086.2 | learning rate: 5.716E-04 | approx flops per GPU: 67.9TFLOPS | lm_loss: 2.342159E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 14:09:12,286] [INFO] [logging.py:60:log_dist] [Rank 0] step=21430, skipped=24, lr=[0.0005715714619856104, 0.0005715714619856104], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21430 loss: 2.3797 iter time (s): 63.109 samples/sec: 16.226 %comms: 0.002885334017647187 %optimizer_step 0.05622914331900312 %forward: 23.097573655627265 %backward: 61.83386399806932 [2025-04-09 14:09:12,286] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21843.72 | forward: 145765.89 | backward_microstep: 390237.33 | backward: 390225.75 | backward_inner_microstep: 390208.04 | backward_inner: 390201.33 | backward_allreduce_microstep: 8.47 | backward_allreduce: 2.92 | reduce_tied_grads: 0.33 | comms: 18.21 | reduce_grads: 0.24 | step: 354.86 | _step_clipping: 0.12 | _step_step: 353.10 | _step_zero_grad: 0.50 | _step_check_overflow: 0.53 samples/sec: 16.226 | iteration 21430/ 143000 | elapsed time per iteration (ms): 63109.3 | learning rate: 5.716E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.351967E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 14:19:57,020] [INFO] [logging.py:60:log_dist] [Rank 0] step=21440, skipped=24, lr=[0.0005715434510261255, 0.0005715434510261255], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21440 loss: 2.3426 iter time (s): 64.473 samples/sec: 15.883 %comms: 0.0028266867369474225 %optimizer_step 0.05586584545264162 %forward: 22.688735408548045 %backward: 60.58268830563117 [2025-04-09 14:19:57,021] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 34551.26 | forward: 146280.93 | backward_microstep: 390611.09 | backward: 390594.35 | backward_inner_microstep: 390574.04 | backward_inner: 390567.10 | backward_allreduce_microstep: 8.82 | backward_allreduce: 3.10 | reduce_tied_grads: 0.35 | comms: 18.22 | reduce_grads: 0.22 | step: 360.18 | _step_clipping: 0.14 | _step_step: 358.12 | _step_zero_grad: 0.54 | _step_check_overflow: 0.71 samples/sec: 15.882 | iteration 21440/ 143000 | elapsed time per iteration (ms): 64473.5 | learning rate: 5.715E-04 | approx flops per GPU: 68.5TFLOPS | lm_loss: 2.355554E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 14:30:31,620] [INFO] [logging.py:60:log_dist] [Rank 0] step=21450, skipped=24, lr=[0.0005715154269607361, 0.0005715154269607361], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21450 loss: 2.3590 iter time (s): 63.459 samples/sec: 16.136 %comms: 0.00290381147264489 %optimizer_step 0.05804966722348701 %forward: 22.99013001693953 %backward: 61.51017899214518 [2025-04-09 14:30:31,620] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25046.17 | forward: 145893.66 | backward_microstep: 390351.85 | backward: 390339.03 | backward_inner_microstep: 390321.01 | backward_inner: 390314.06 | backward_allreduce_microstep: 8.53 | backward_allreduce: 2.92 | reduce_tied_grads: 0.33 | comms: 18.43 | reduce_grads: 0.24 | step: 368.38 | _step_clipping: 0.13 | _step_step: 366.51 | _step_zero_grad: 0.52 | _step_check_overflow: 0.56 samples/sec: 16.136 | iteration 21450/ 143000 | elapsed time per iteration (ms): 63459.9 | learning rate: 5.715E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.356043E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 14:41:06,007] [INFO] [logging.py:60:log_dist] [Rank 0] step=21460, skipped=24, lr=[0.0005714873897907947, 0.0005714873897907947], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21460 loss: 2.3580 iter time (s): 63.438 samples/sec: 16.142 %comms: 0.0028839174659208936 %optimizer_step 0.055228137561192193 %forward: 22.93966265284586 %backward: 61.532415492390044 [2025-04-09 14:41:06,008] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25224.85 | forward: 145525.06 | backward_microstep: 390363.69 | backward: 390350.49 | backward_inner_microstep: 390332.98 | backward_inner: 390326.11 | backward_allreduce_microstep: 8.31 | backward_allreduce: 2.86 | reduce_tied_grads: 0.32 | comms: 18.30 | reduce_grads: 0.20 | step: 350.36 | _step_clipping: 1.86 | _step_step: 346.93 | _step_zero_grad: 0.47 | _step_check_overflow: 0.51 samples/sec: 16.142 | iteration 21460/ 143000 | elapsed time per iteration (ms): 63438.7 | learning rate: 5.715E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.359333E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 14:51:41,827] [INFO] [logging.py:60:log_dist] [Rank 0] step=21470, skipped=24, lr=[0.0005714593395176543, 0.0005714593395176543], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21470 loss: 2.3435 iter time (s): 63.581 samples/sec: 16.105 %comms: 0.0029506589632963755 %optimizer_step 0.055464986363782415 %forward: 22.91718762067216 %backward: 61.41877403240675 [2025-04-09 14:51:41,827] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26293.80 | forward: 145710.52 | backward_microstep: 390522.32 | backward: 390508.71 | backward_inner_microstep: 390490.89 | backward_inner: 390484.00 | backward_allreduce_microstep: 8.46 | backward_allreduce: 2.91 | reduce_tied_grads: 0.36 | comms: 18.76 | reduce_grads: 0.21 | step: 352.65 | _step_clipping: 0.13 | _step_step: 350.66 | _step_zero_grad: 0.51 | _step_check_overflow: 0.69 samples/sec: 16.105 | iteration 21470/ 143000 | elapsed time per iteration (ms): 63582.0 | learning rate: 5.715E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.359694E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 15:02:18,354] [INFO] [logging.py:60:log_dist] [Rank 0] step=21480, skipped=24, lr=[0.0005714312761426691, 0.0005714312761426691], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21480 loss: 2.3531 iter time (s): 63.652 samples/sec: 16.087 %comms: 0.0028832910248509304 %optimizer_step 0.05627580880609511 %forward: 22.918411102934243 %backward: 61.351119838028936 [2025-04-09 15:02:18,354] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26818.27 | forward: 145880.45 | backward_microstep: 390528.21 | backward: 390512.64 | backward_inner_microstep: 390494.74 | backward_inner: 390487.79 | backward_allreduce_microstep: 8.44 | backward_allreduce: 2.90 | reduce_tied_grads: 0.33 | comms: 18.35 | reduce_grads: 0.22 | step: 358.21 | _step_clipping: 0.14 | _step_step: 356.26 | _step_zero_grad: 0.59 | _step_check_overflow: 0.57 samples/sec: 16.087 | iteration 21480/ 143000 | elapsed time per iteration (ms): 63652.7 | learning rate: 5.714E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.348899E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 15:12:52,517] [INFO] [logging.py:60:log_dist] [Rank 0] step=21490, skipped=24, lr=[0.0005714031996671933, 0.0005714031996671933], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21490 loss: 2.3487 iter time (s): 63.416 samples/sec: 16.147 %comms: 0.002860125409508207 %optimizer_step 0.05525950271808009 %forward: 22.962376372305748 %backward: 61.556938048381305 [2025-04-09 15:12:52,517] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24917.60 | forward: 145617.58 | backward_microstep: 390379.82 | backward: 390367.80 | backward_inner_microstep: 390350.12 | backward_inner: 390343.23 | backward_allreduce_microstep: 8.44 | backward_allreduce: 2.87 | reduce_tied_grads: 0.31 | comms: 18.14 | reduce_grads: 0.19 | step: 350.43 | _step_clipping: 0.14 | _step_step: 348.73 | _step_zero_grad: 0.49 | _step_check_overflow: 0.47 samples/sec: 16.147 | iteration 21490/ 143000 | elapsed time per iteration (ms): 63416.3 | learning rate: 5.714E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.344014E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 15:23:20,208] [INFO] [logging.py:60:log_dist] [Rank 0] step=21500, skipped=24, lr=[0.0005713751100925821, 0.0005713751100925821], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21500 loss: 2.3369 iter time (s): 62.769 samples/sec: 16.314 %comms: 0.002940095286200084 %optimizer_step 0.05735354681027506 %forward: 23.198170814042914 %backward: 62.21214733094305 [2025-04-09 15:23:20,208] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18246.06 | forward: 145611.55 | backward_microstep: 390509.89 | backward: 390496.63 | backward_inner_microstep: 390479.45 | backward_inner: 390472.74 | backward_allreduce_microstep: 8.06 | backward_allreduce: 2.80 | reduce_tied_grads: 0.34 | comms: 18.45 | reduce_grads: 0.21 | step: 360.00 | _step_clipping: 0.11 | _step_step: 358.09 | _step_zero_grad: 0.51 | _step_check_overflow: 0.64 samples/sec: 16.314 | iteration 21500/ 143000 | elapsed time per iteration (ms): 62769.1 | learning rate: 5.714E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.341839E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 15:33:51,004] [INFO] [logging.py:60:log_dist] [Rank 0] step=21510, skipped=24, lr=[0.0005713470074201912, 0.0005713470074201912], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21510 loss: 2.3780 iter time (s): 63.079 samples/sec: 16.234 %comms: 0.0028916436608705247 %optimizer_step 0.05618999780582435 %forward: 23.113738403664634 %backward: 61.91482875844798 [2025-04-09 15:33:51,005] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21167.43 | forward: 145799.27 | backward_microstep: 390566.75 | backward: 390552.86 | backward_inner_microstep: 390535.13 | backward_inner: 390528.07 | backward_allreduce_microstep: 8.33 | backward_allreduce: 2.86 | reduce_tied_grads: 0.30 | comms: 18.24 | reduce_grads: 0.22 | step: 354.44 | _step_clipping: 0.11 | _step_step: 352.56 | _step_zero_grad: 0.53 | _step_check_overflow: 0.63 samples/sec: 16.233 | iteration 21510/ 143000 | elapsed time per iteration (ms): 63079.6 | learning rate: 5.713E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.357546E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 15:44:26,350] [INFO] [logging.py:60:log_dist] [Rank 0] step=21520, skipped=24, lr=[0.0005713188916513769, 0.0005713188916513769], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21520 loss: 2.3516 iter time (s): 63.534 samples/sec: 16.117 %comms: 0.002858067500686328 %optimizer_step 0.05649300908009531 %forward: 22.896737602032978 %backward: 61.446735943920785 [2025-04-09 15:44:26,350] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26193.15 | forward: 145472.05 | backward_microstep: 390406.97 | backward: 390395.47 | backward_inner_microstep: 390377.15 | backward_inner: 390370.64 | backward_allreduce_microstep: 7.86 | backward_allreduce: 2.69 | reduce_tied_grads: 0.30 | comms: 18.16 | reduce_grads: 0.20 | step: 358.92 | _step_clipping: 0.12 | _step_step: 357.19 | _step_zero_grad: 0.52 | _step_check_overflow: 0.48 samples/sec: 16.117 | iteration 21520/ 143000 | elapsed time per iteration (ms): 63534.5 | learning rate: 5.713E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.354924E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 15:55:08,983] [INFO] [logging.py:60:log_dist] [Rank 0] step=21530, skipped=24, lr=[0.0005712907627874964, 0.0005712907627874964], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21530 loss: 2.3617 iter time (s): 64.263 samples/sec: 15.935 %comms: 0.00285144310646316 %optimizer_step 0.05743871278694043 %forward: 22.687442354525402 %backward: 60.762497516335436 [2025-04-09 15:55:08,983] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 33054.24 | forward: 145795.58 | backward_microstep: 390489.84 | backward: 390476.08 | backward_inner_microstep: 390458.11 | backward_inner: 390451.28 | backward_allreduce_microstep: 8.46 | backward_allreduce: 2.90 | reduce_tied_grads: 0.36 | comms: 18.32 | reduce_grads: 0.22 | step: 369.12 | _step_clipping: 0.12 | _step_step: 367.00 | _step_zero_grad: 0.60 | _step_check_overflow: 0.74 samples/sec: 15.934 | iteration 21530/ 143000 | elapsed time per iteration (ms): 64263.3 | learning rate: 5.713E-04 | approx flops per GPU: 68.7TFLOPS | lm_loss: 2.348990E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 16:05:48,953] [INFO] [logging.py:60:log_dist] [Rank 0] step=21540, skipped=24, lr=[0.000571262620829907, 0.000571262620829907], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21540 loss: 2.3378 iter time (s): 63.996 samples/sec: 16.001 %comms: 0.0028414361068272893 %optimizer_step 0.058394212980193205 %forward: 22.794235788576312 %backward: 61.03407898058485 [2025-04-09 16:05:48,954] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30170.75 | forward: 145875.04 | backward_microstep: 390612.49 | backward: 390596.50 | backward_inner_microstep: 390578.06 | backward_inner: 390570.92 | backward_allreduce_microstep: 8.73 | backward_allreduce: 2.99 | reduce_tied_grads: 0.35 | comms: 18.18 | reduce_grads: 0.23 | step: 373.70 | _step_clipping: 0.14 | _step_step: 371.92 | _step_zero_grad: 0.50 | _step_check_overflow: 0.53 samples/sec: 16.001 | iteration 21540/ 143000 | elapsed time per iteration (ms): 63997.1 | learning rate: 5.713E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.353680E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 16:16:35,390] [INFO] [logging.py:60:log_dist] [Rank 0] step=21550, skipped=24, lr=[0.0005712344657799673, 0.0005712344657799673], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21550 loss: 2.3482 iter time (s): 64.643 samples/sec: 15.841 %comms: 0.0031984344878822966 %optimizer_step 0.05636499450330666 %forward: 22.570306677959216 %backward: 60.3935958295058 [2025-04-09 16:16:35,391] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 36841.36 | forward: 145901.37 | backward_microstep: 390418.19 | backward: 390402.68 | backward_inner_microstep: 390384.22 | backward_inner: 390377.14 | backward_allreduce_microstep: 8.71 | backward_allreduce: 3.00 | reduce_tied_grads: 0.33 | comms: 20.68 | reduce_grads: 0.21 | step: 364.36 | _step_clipping: 0.12 | _step_step: 362.50 | _step_zero_grad: 0.57 | _step_check_overflow: 0.55 samples/sec: 15.841 | iteration 21550/ 143000 | elapsed time per iteration (ms): 64643.7 | learning rate: 5.712E-04 | approx flops per GPU: 68.3TFLOPS | lm_loss: 2.357811E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 16:27:17,568] [INFO] [logging.py:60:log_dist] [Rank 0] step=21560, skipped=24, lr=[0.0005712062976390359, 0.0005712062976390359], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21560 loss: 2.3547 iter time (s): 64.217 samples/sec: 15.946 %comms: 0.002865568574215084 %optimizer_step 0.0568511088148726 %forward: 22.73844208689253 %backward: 60.81774702520044 [2025-04-09 16:27:17,568] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32295.28 | forward: 146019.77 | backward_microstep: 390568.34 | backward: 390554.16 | backward_inner_microstep: 390535.72 | backward_inner: 390528.53 | backward_allreduce_microstep: 8.74 | backward_allreduce: 3.00 | reduce_tied_grads: 0.37 | comms: 18.40 | reduce_grads: 0.24 | step: 365.08 | _step_clipping: 0.13 | _step_step: 363.22 | _step_zero_grad: 0.57 | _step_check_overflow: 0.50 samples/sec: 15.946 | iteration 21560/ 143000 | elapsed time per iteration (ms): 64217.8 | learning rate: 5.712E-04 | approx flops per GPU: 68.8TFLOPS | lm_loss: 2.365496E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 16:37:53,277] [INFO] [logging.py:60:log_dist] [Rank 0] step=21570, skipped=24, lr=[0.0005711781164084725, 0.0005711781164084725], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21570 loss: 2.3566 iter time (s): 63.570 samples/sec: 16.108 %comms: 0.0029007980311222944 %optimizer_step 0.05711502364780474 %forward: 22.932270817776594 %backward: 61.40337379594355 [2025-04-09 16:37:53,278] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26290.00 | forward: 145781.33 | backward_microstep: 390356.99 | backward: 390343.62 | backward_inner_microstep: 390326.04 | backward_inner: 390319.19 | backward_allreduce_microstep: 8.39 | backward_allreduce: 2.93 | reduce_tied_grads: 0.32 | comms: 18.44 | reduce_grads: 0.21 | step: 363.08 | _step_clipping: 0.12 | _step_step: 361.16 | _step_zero_grad: 0.60 | _step_check_overflow: 0.57 samples/sec: 16.108 | iteration 21570/ 143000 | elapsed time per iteration (ms): 63571.0 | learning rate: 5.712E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.356152E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 16:48:34,790] [INFO] [logging.py:60:log_dist] [Rank 0] step=21580, skipped=24, lr=[0.0005711499220896372, 0.0005711499220896372], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21580 loss: 2.3782 iter time (s): 64.151 samples/sec: 15.962 %comms: 0.0028218950384098873 %optimizer_step 0.05720243035985208 %forward: 22.70566886461076 %backward: 60.84049628606495 [2025-04-09 16:48:34,791] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32276.95 | forward: 145658.39 | backward_microstep: 390305.98 | backward: 390295.86 | backward_inner_microstep: 390277.89 | backward_inner: 390269.43 | backward_allreduce_microstep: 8.67 | backward_allreduce: 2.98 | reduce_tied_grads: 0.35 | comms: 18.10 | reduce_grads: 0.21 | step: 366.96 | _step_clipping: 0.13 | _step_step: 364.83 | _step_zero_grad: 0.60 | _step_check_overflow: 0.69 samples/sec: 15.962 | iteration 21580/ 143000 | elapsed time per iteration (ms): 64151.3 | learning rate: 5.711E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.358149E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 16:59:16,273] [INFO] [logging.py:60:log_dist] [Rank 0] step=21590, skipped=24, lr=[0.0005711217146838908, 0.0005711217146838908], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21590 loss: 2.3551 iter time (s): 64.148 samples/sec: 15.963 %comms: 0.002855551600071265 %optimizer_step 0.05548768269469208 %forward: 22.69352691097436 %backward: 60.86593855584257 [2025-04-09 16:59:16,274] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32154.92 | forward: 145573.70 | backward_microstep: 390454.58 | backward: 390440.84 | backward_inner_microstep: 390422.72 | backward_inner: 390415.71 | backward_allreduce_microstep: 8.63 | backward_allreduce: 2.95 | reduce_tied_grads: 0.34 | comms: 18.32 | reduce_grads: 0.23 | step: 355.94 | _step_clipping: 0.14 | _step_step: 353.92 | _step_zero_grad: 0.54 | _step_check_overflow: 0.70 samples/sec: 15.963 | iteration 21590/ 143000 | elapsed time per iteration (ms): 64148.3 | learning rate: 5.711E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.360695E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 17:09:58,495] [INFO] [logging.py:60:log_dist] [Rank 0] step=21600, skipped=24, lr=[0.0005710934941925948, 0.0005710934941925948], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21600 loss: 2.3544 iter time (s): 64.222 samples/sec: 15.945 %comms: 0.0028652987836721463 %optimizer_step 0.05674606538133291 %forward: 22.698353908438982 %backward: 60.7965087056205 [2025-04-09 17:09:58,496] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32662.52 | forward: 145772.28 | backward_microstep: 390456.07 | backward: 390444.42 | backward_inner_microstep: 390426.04 | backward_inner: 390418.90 | backward_allreduce_microstep: 8.82 | backward_allreduce: 3.03 | reduce_tied_grads: 0.35 | comms: 18.40 | reduce_grads: 0.23 | step: 364.43 | _step_clipping: 0.13 | _step_step: 362.25 | _step_zero_grad: 0.61 | _step_check_overflow: 0.76 samples/sec: 15.945 | iteration 21600/ 143000 | elapsed time per iteration (ms): 64222.2 | learning rate: 5.711E-04 | approx flops per GPU: 68.8TFLOPS | lm_loss: 2.350688E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 17:20:26,593] [INFO] [logging.py:60:log_dist] [Rank 0] step=21610, skipped=24, lr=[0.000571065260617111, 0.000571065260617111], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21610 loss: 2.3622 iter time (s): 62.809 samples/sec: 16.303 %comms: 0.0029241130982446085 %optimizer_step 0.05782978081985399 %forward: 23.179044933007198 %backward: 62.18450720762904 [2025-04-09 17:20:26,594] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18565.66 | forward: 145585.55 | backward_microstep: 390592.15 | backward: 390575.45 | backward_inner_microstep: 390557.45 | backward_inner: 390550.23 | backward_allreduce_microstep: 8.40 | backward_allreduce: 2.87 | reduce_tied_grads: 0.34 | comms: 18.37 | reduce_grads: 0.22 | step: 363.22 | _step_clipping: 0.13 | _step_step: 361.20 | _step_zero_grad: 0.60 | _step_check_overflow: 0.64 samples/sec: 16.303 | iteration 21610/ 143000 | elapsed time per iteration (ms): 62809.8 | learning rate: 5.711E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.344377E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 17:31:03,204] [INFO] [logging.py:60:log_dist] [Rank 0] step=21620, skipped=24, lr=[0.0005710370139588023, 0.0005710370139588023], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21620 loss: 2.3379 iter time (s): 63.660 samples/sec: 16.085 %comms: 0.0028776679893049743 %optimizer_step 0.056997727964329656 %forward: 22.894134900108508 %backward: 61.34407130204301 [2025-04-09 17:31:03,205] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26990.64 | forward: 145745.13 | backward_microstep: 390540.26 | backward: 390519.21 | backward_inner_microstep: 390501.29 | backward_inner: 390492.24 | backward_allreduce_microstep: 8.28 | backward_allreduce: 2.85 | reduce_tied_grads: 0.37 | comms: 18.32 | reduce_grads: 0.22 | step: 362.85 | _step_clipping: 0.12 | _step_step: 360.99 | _step_zero_grad: 0.58 | _step_check_overflow: 0.51 samples/sec: 16.085 | iteration 21620/ 143000 | elapsed time per iteration (ms): 63661.1 | learning rate: 5.710E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.342758E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 17:41:32,699] [INFO] [logging.py:60:log_dist] [Rank 0] step=21630, skipped=24, lr=[0.0005710087542190319, 0.0005710087542190319], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21630 loss: 2.3316 iter time (s): 62.949 samples/sec: 16.267 %comms: 0.0029089454704941844 %optimizer_step 0.05653741031229479 %forward: 23.11527986939033 %backward: 61.9953354617759 [2025-04-09 17:41:32,700] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20422.93 | forward: 145508.21 | backward_microstep: 390265.25 | backward: 390254.00 | backward_inner_microstep: 390236.89 | backward_inner: 390230.38 | backward_allreduce_microstep: 8.19 | backward_allreduce: 2.82 | reduce_tied_grads: 0.34 | comms: 18.31 | reduce_grads: 0.23 | step: 355.90 | _step_clipping: 0.14 | _step_step: 354.02 | _step_zero_grad: 0.51 | _step_check_overflow: 0.61 samples/sec: 16.267 | iteration 21630/ 143000 | elapsed time per iteration (ms): 62949.5 | learning rate: 5.710E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.341768E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 17:52:08,082] [INFO] [logging.py:60:log_dist] [Rank 0] step=21640, skipped=24, lr=[0.0005709804813991638, 0.0005709804813991638], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21640 loss: 2.3494 iter time (s): 63.538 samples/sec: 16.116 %comms: 0.0029300199486200266 %optimizer_step 0.05596906964914569 %forward: 22.920364203791088 %backward: 61.45353013783963 [2025-04-09 17:52:08,083] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25957.28 | forward: 145630.75 | backward_microstep: 390474.99 | backward: 390461.67 | backward_inner_microstep: 390444.30 | backward_inner: 390437.57 | backward_allreduce_microstep: 8.15 | backward_allreduce: 2.78 | reduce_tied_grads: 0.36 | comms: 18.62 | reduce_grads: 0.21 | step: 355.61 | _step_clipping: 0.13 | _step_step: 353.80 | _step_zero_grad: 0.51 | _step_check_overflow: 0.51 samples/sec: 16.116 | iteration 21640/ 143000 | elapsed time per iteration (ms): 63538.3 | learning rate: 5.710E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.347394E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 18:02:36,411] [INFO] [logging.py:60:log_dist] [Rank 0] step=21650, skipped=24, lr=[0.0005709521955005626, 0.0005709521955005626], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21650 loss: 2.3346 iter time (s): 62.832 samples/sec: 16.297 %comms: 0.0028924098822243147 %optimizer_step 0.05608921052066516 %forward: 23.17469227701123 %backward: 62.156935337664734 [2025-04-09 18:02:36,412] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18826.85 | forward: 145612.07 | backward_microstep: 390558.90 | backward: 390546.71 | backward_inner_microstep: 390529.87 | backward_inner: 390523.26 | backward_allreduce_microstep: 7.89 | backward_allreduce: 2.71 | reduce_tied_grads: 0.30 | comms: 18.17 | reduce_grads: 0.22 | step: 352.42 | _step_clipping: 0.13 | _step_step: 350.63 | _step_zero_grad: 0.49 | _step_check_overflow: 0.57 samples/sec: 16.297 | iteration 21650/ 143000 | elapsed time per iteration (ms): 62832.9 | learning rate: 5.710E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.338211E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 18:13:05,585] [INFO] [logging.py:60:log_dist] [Rank 0] step=21660, skipped=24, lr=[0.0005709238965245934, 0.0005709238965245934], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21660 loss: 2.3619 iter time (s): 62.917 samples/sec: 16.275 %comms: 0.002884511145883887 %optimizer_step 0.05755588776249962 %forward: 23.144592972090873 %backward: 62.09320272576587 [2025-04-09 18:13:05,586] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19545.52 | forward: 145618.38 | backward_microstep: 390684.75 | backward: 390670.59 | backward_inner_microstep: 390653.66 | backward_inner: 390646.93 | backward_allreduce_microstep: 7.99 | backward_allreduce: 2.70 | reduce_tied_grads: 0.30 | comms: 18.15 | reduce_grads: 0.19 | step: 362.12 | _step_clipping: 0.13 | _step_step: 360.48 | _step_zero_grad: 0.46 | _step_check_overflow: 0.46 samples/sec: 16.275 | iteration 21660/ 143000 | elapsed time per iteration (ms): 62917.4 | learning rate: 5.709E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.355159E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 18:23:33,297] [INFO] [logging.py:60:log_dist] [Rank 0] step=21670, skipped=24, lr=[0.0005708955844726221, 0.0005708955844726221], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21670 loss: 2.3495 iter time (s): 62.771 samples/sec: 16.313 %comms: 0.002917887505584013 %optimizer_step 0.056351669216950555 %forward: 23.196924637271195 %backward: 62.20470333786417 [2025-04-09 18:23:33,298] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18335.97 | forward: 145608.78 | backward_microstep: 390475.18 | backward: 390463.45 | backward_inner_microstep: 390446.60 | backward_inner: 390440.21 | backward_allreduce_microstep: 8.19 | backward_allreduce: 2.70 | reduce_tied_grads: 0.32 | comms: 18.32 | reduce_grads: 0.21 | step: 353.72 | _step_clipping: 0.13 | _step_step: 351.59 | _step_zero_grad: 0.49 | _step_check_overflow: 0.69 samples/sec: 16.313 | iteration 21670/ 143000 | elapsed time per iteration (ms): 62771.3 | learning rate: 5.709E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.341762E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 18:34:08,481] [INFO] [logging.py:60:log_dist] [Rank 0] step=21680, skipped=24, lr=[0.0005708672593460151, 0.0005708672593460151], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21680 loss: 2.3501 iter time (s): 63.518 samples/sec: 16.121 %comms: 0.002896711397317815 %optimizer_step 0.055204550780599985 %forward: 22.93094340774834 %backward: 61.48851448901169 [2025-04-09 18:34:08,481] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25663.02 | forward: 145652.03 | backward_microstep: 390572.98 | backward: 390560.78 | backward_inner_microstep: 390544.12 | backward_inner: 390537.59 | backward_allreduce_microstep: 7.83 | backward_allreduce: 2.67 | reduce_tied_grads: 0.31 | comms: 18.40 | reduce_grads: 0.21 | step: 350.65 | _step_clipping: 0.12 | _step_step: 348.89 | _step_zero_grad: 0.51 | _step_check_overflow: 0.51 samples/sec: 16.121 | iteration 21680/ 143000 | elapsed time per iteration (ms): 63518.3 | learning rate: 5.709E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.352199E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 18:44:40,927] [INFO] [logging.py:60:log_dist] [Rank 0] step=21690, skipped=24, lr=[0.0005708389211461397, 0.0005708389211461397], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21690 loss: 2.3512 iter time (s): 63.244 samples/sec: 16.191 %comms: 0.0029019332117504065 %optimizer_step 0.056263266788888655 %forward: 23.031852993107513 %backward: 61.74571374327574 [2025-04-09 18:44:40,928] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22971.98 | forward: 145662.64 | backward_microstep: 390517.63 | backward: 390504.57 | backward_inner_microstep: 390487.82 | backward_inner: 390481.28 | backward_allreduce_microstep: 7.88 | backward_allreduce: 2.71 | reduce_tied_grads: 0.32 | comms: 18.35 | reduce_grads: 0.21 | step: 355.83 | _step_clipping: 0.12 | _step_step: 354.03 | _step_zero_grad: 0.53 | _step_check_overflow: 0.52 samples/sec: 16.191 | iteration 21690/ 143000 | elapsed time per iteration (ms): 63244.6 | learning rate: 5.708E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.349752E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 18:55:13,664] [INFO] [logging.py:60:log_dist] [Rank 0] step=21700, skipped=24, lr=[0.0005708105698743633, 0.0005708105698743633], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21700 loss: 2.3406 iter time (s): 63.273 samples/sec: 16.184 %comms: 0.0029240749784552804 %optimizer_step 0.05659259408341619 %forward: 23.010849915086315 %backward: 61.70783755833492 [2025-04-09 18:55:13,665] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23371.32 | forward: 145596.71 | backward_microstep: 390457.83 | backward: 390444.43 | backward_inner_microstep: 390427.52 | backward_inner: 390420.78 | backward_allreduce_microstep: 7.93 | backward_allreduce: 2.74 | reduce_tied_grads: 0.30 | comms: 18.50 | reduce_grads: 0.20 | step: 358.08 | _step_clipping: 0.15 | _step_step: 355.94 | _step_zero_grad: 0.63 | _step_check_overflow: 0.73 samples/sec: 16.184 | iteration 21700/ 143000 | elapsed time per iteration (ms): 63273.7 | learning rate: 5.708E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.345162E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 19:05:56,664] [INFO] [logging.py:60:log_dist] [Rank 0] step=21710, skipped=24, lr=[0.0005707822055320545, 0.0005707822055320545], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21710 loss: 2.3652 iter time (s): 64.299 samples/sec: 15.926 %comms: 0.002949893174459039 %optimizer_step 0.05480058034687549 %forward: 22.645383039342317 %backward: 60.70353235432051 [2025-04-09 19:05:56,665] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 33776.00 | forward: 145608.40 | backward_microstep: 390332.19 | backward: 390319.91 | backward_inner_microstep: 390303.55 | backward_inner: 390297.08 | backward_allreduce_microstep: 7.74 | backward_allreduce: 2.66 | reduce_tied_grads: 0.31 | comms: 18.97 | reduce_grads: 0.21 | step: 352.36 | _step_clipping: 0.11 | _step_step: 350.49 | _step_zero_grad: 0.51 | _step_check_overflow: 0.64 samples/sec: 15.925 | iteration 21710/ 143000 | elapsed time per iteration (ms): 64300.0 | learning rate: 5.708E-04 | approx flops per GPU: 68.7TFLOPS | lm_loss: 2.348030E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 19:16:31,575] [INFO] [logging.py:60:log_dist] [Rank 0] step=21720, skipped=24, lr=[0.0005707538281205824, 0.0005707538281205824], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21720 loss: 2.3548 iter time (s): 63.491 samples/sec: 16.128 %comms: 0.002929306534363196 %optimizer_step 0.05984591988073139 %forward: 22.928152506177373 %backward: 61.507462186746544 [2025-04-09 19:16:31,576] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25463.09 | forward: 145572.02 | backward_microstep: 390529.39 | backward: 390514.05 | backward_inner_microstep: 390497.03 | backward_inner: 390490.20 | backward_allreduce_microstep: 7.90 | backward_allreduce: 2.72 | reduce_tied_grads: 0.38 | comms: 18.60 | reduce_grads: 0.24 | step: 379.96 | _step_clipping: 0.14 | _step_step: 377.85 | _step_zero_grad: 0.56 | _step_check_overflow: 0.73 samples/sec: 16.128 | iteration 21720/ 143000 | elapsed time per iteration (ms): 63491.1 | learning rate: 5.708E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.358226E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 19:27:10,770] [INFO] [logging.py:60:log_dist] [Rank 0] step=21730, skipped=24, lr=[0.0005707254376413163, 0.0005707254376413163], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21730 loss: 2.3413 iter time (s): 63.919 samples/sec: 16.020 %comms: 0.0028312713402927625 %optimizer_step 0.05588337671635148 %forward: 22.787246255232418 %backward: 61.100630744627594 [2025-04-09 19:27:10,770] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29668.12 | forward: 145653.47 | backward_microstep: 390561.65 | backward: 390548.24 | backward_inner_microstep: 390531.14 | backward_inner: 390524.34 | backward_allreduce_microstep: 8.05 | backward_allreduce: 2.79 | reduce_tied_grads: 0.31 | comms: 18.10 | reduce_grads: 0.20 | step: 357.20 | _step_clipping: 0.12 | _step_step: 355.40 | _step_zero_grad: 0.52 | _step_check_overflow: 0.57 samples/sec: 16.020 | iteration 21730/ 143000 | elapsed time per iteration (ms): 63919.4 | learning rate: 5.707E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.345161E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 19:37:49,849] [INFO] [logging.py:60:log_dist] [Rank 0] step=21740, skipped=24, lr=[0.0005706970340956267, 0.0005706970340956267], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21740 loss: 2.3405 iter time (s): 63.907 samples/sec: 16.023 %comms: 0.0028655090790199985 %optimizer_step 0.05722586788645023 %forward: 22.796939109529553 %backward: 61.11406525112657 [2025-04-09 19:37:49,850] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29463.95 | forward: 145689.07 | backward_microstep: 390575.68 | backward: 390563.44 | backward_inner_microstep: 390546.12 | backward_inner: 390539.43 | backward_allreduce_microstep: 8.22 | backward_allreduce: 2.83 | reduce_tied_grads: 0.31 | comms: 18.31 | reduce_grads: 0.21 | step: 365.72 | _step_clipping: 0.12 | _step_step: 363.86 | _step_zero_grad: 0.57 | _step_check_overflow: 0.52 samples/sec: 16.023 | iteration 21740/ 143000 | elapsed time per iteration (ms): 63907.9 | learning rate: 5.707E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.343096E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 19:48:30,125] [INFO] [logging.py:60:log_dist] [Rank 0] step=21750, skipped=24, lr=[0.0005706686174848843, 0.0005706686174848843], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21750 loss: 2.3608 iter time (s): 64.027 samples/sec: 15.993 %comms: 0.002831516236151113 %optimizer_step 0.05730416898669126 %forward: 22.769254530570738 %backward: 60.98857932093087 [2025-04-09 19:48:30,126] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30659.94 | forward: 145784.70 | backward_microstep: 390506.28 | backward: 390491.57 | backward_inner_microstep: 390474.16 | backward_inner: 390467.39 | backward_allreduce_microstep: 8.20 | backward_allreduce: 2.81 | reduce_tied_grads: 0.30 | comms: 18.13 | reduce_grads: 0.23 | step: 366.90 | _step_clipping: 0.15 | _step_step: 364.94 | _step_zero_grad: 0.59 | _step_check_overflow: 0.60 samples/sec: 15.993 | iteration 21750/ 143000 | elapsed time per iteration (ms): 64027.6 | learning rate: 5.707E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.352996E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 19:59:02,367] [INFO] [logging.py:60:log_dist] [Rank 0] step=21760, skipped=24, lr=[0.0005706401878104607, 0.0005706401878104607], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21760 loss: 2.3497 iter time (s): 63.224 samples/sec: 16.196 %comms: 0.0028581457875590964 %optimizer_step 0.0575102661423737 %forward: 23.01744496085981 %backward: 61.74853194983692 [2025-04-09 19:59:02,367] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22979.70 | forward: 145524.53 | backward_microstep: 390408.60 | backward: 390396.33 | backward_inner_microstep: 390378.84 | backward_inner: 390372.04 | backward_allreduce_microstep: 8.17 | backward_allreduce: 2.81 | reduce_tied_grads: 0.31 | comms: 18.07 | reduce_grads: 0.21 | step: 363.60 | _step_clipping: 0.14 | _step_step: 361.74 | _step_zero_grad: 0.55 | _step_check_overflow: 0.54 samples/sec: 16.196 | iteration 21760/ 143000 | elapsed time per iteration (ms): 63224.2 | learning rate: 5.706E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.344031E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 20:09:31,708] [INFO] [logging.py:60:log_dist] [Rank 0] step=21770, skipped=24, lr=[0.0005706117450737282, 0.0005706117450737282], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21770 loss: 2.3297 iter time (s): 62.934 samples/sec: 16.271 %comms: 0.002936441314757569 %optimizer_step 0.05845677993939189 %forward: 23.123327879538223 %backward: 62.071671489443794 [2025-04-09 20:09:31,709] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19812.65 | forward: 145523.27 | backward_microstep: 390654.48 | backward: 390638.96 | backward_inner_microstep: 390620.64 | backward_inner: 390613.40 | backward_allreduce_microstep: 8.66 | backward_allreduce: 3.00 | reduce_tied_grads: 0.35 | comms: 18.48 | reduce_grads: 0.22 | step: 367.89 | _step_clipping: 0.12 | _step_step: 366.04 | _step_zero_grad: 0.52 | _step_check_overflow: 0.58 samples/sec: 16.271 | iteration 21770/ 143000 | elapsed time per iteration (ms): 62934.1 | learning rate: 5.706E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.343473E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 20:20:10,346] [INFO] [logging.py:60:log_dist] [Rank 0] step=21780, skipped=24, lr=[0.0005705832892760593, 0.0005705832892760593], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21780 loss: 2.3420 iter time (s): 63.863 samples/sec: 16.034 %comms: 0.0028652087668020287 %optimizer_step 0.056322400434422805 %forward: 22.815680406977375 %backward: 61.15520616597586 [2025-04-09 20:20:10,347] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29032.70 | forward: 145708.31 | backward_microstep: 390570.20 | backward: 390556.91 | backward_inner_microstep: 390539.62 | backward_inner: 390532.75 | backward_allreduce_microstep: 8.14 | backward_allreduce: 2.81 | reduce_tied_grads: 0.33 | comms: 18.30 | reduce_grads: 0.21 | step: 359.69 | _step_clipping: 0.14 | _step_step: 357.75 | _step_zero_grad: 0.52 | _step_check_overflow: 0.64 samples/sec: 16.034 | iteration 21780/ 143000 | elapsed time per iteration (ms): 63863.8 | learning rate: 5.706E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.351057E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 20:30:55,909] [INFO] [logging.py:60:log_dist] [Rank 0] step=21790, skipped=24, lr=[0.0005705548204188275, 0.0005705548204188275], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21790 loss: 2.3584 iter time (s): 64.556 samples/sec: 15.862 %comms: 0.0028113547896572023 %optimizer_step 0.05615973141574546 %forward: 22.562830930227722 %backward: 60.48842976894445 [2025-04-09 20:30:55,910] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 36062.90 | forward: 145655.95 | backward_microstep: 390500.53 | backward: 390487.32 | backward_inner_microstep: 390469.96 | backward_inner: 390462.78 | backward_allreduce_microstep: 8.22 | backward_allreduce: 2.80 | reduce_tied_grads: 0.32 | comms: 18.15 | reduce_grads: 0.20 | step: 362.54 | _step_clipping: 0.12 | _step_step: 360.67 | _step_zero_grad: 0.52 | _step_check_overflow: 0.61 samples/sec: 15.862 | iteration 21790/ 143000 | elapsed time per iteration (ms): 64556.3 | learning rate: 5.706E-04 | approx flops per GPU: 68.4TFLOPS | lm_loss: 2.353715E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 20:41:37,346] [INFO] [logging.py:60:log_dist] [Rank 0] step=21800, skipped=24, lr=[0.000570526338503407, 0.000570526338503407], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21800 loss: 2.3393 iter time (s): 64.143 samples/sec: 15.964 %comms: 0.00281587168175372 %optimizer_step 0.05996672631882506 %forward: 22.716937018745835 %backward: 60.869577995718224 [2025-04-09 20:41:37,347] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31921.27 | forward: 145713.50 | backward_microstep: 390447.32 | backward: 390436.39 | backward_inner_microstep: 390418.91 | backward_inner: 390412.16 | backward_allreduce_microstep: 8.39 | backward_allreduce: 2.91 | reduce_tied_grads: 0.32 | comms: 18.06 | reduce_grads: 0.21 | step: 384.65 | _step_clipping: 0.13 | _step_step: 382.72 | _step_zero_grad: 0.54 | _step_check_overflow: 0.65 samples/sec: 15.964 | iteration 21800/ 143000 | elapsed time per iteration (ms): 64143.7 | learning rate: 5.705E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.348045E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 20:52:14,253] [INFO] [logging.py:60:log_dist] [Rank 0] step=21810, skipped=24, lr=[0.0005704978435311722, 0.0005704978435311722], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21810 loss: 2.3557 iter time (s): 63.690 samples/sec: 16.078 %comms: 0.002831707717919166 %optimizer_step 0.05697124905069427 %forward: 22.888593759314265 %backward: 61.35530511621462 [2025-04-09 20:52:14,254] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26967.96 | forward: 145777.67 | backward_microstep: 390785.78 | backward: 390772.52 | backward_inner_microstep: 390754.88 | backward_inner: 390748.10 | backward_allreduce_microstep: 8.37 | backward_allreduce: 2.91 | reduce_tied_grads: 0.32 | comms: 18.04 | reduce_grads: 0.21 | step: 362.85 | _step_clipping: 0.12 | _step_step: 360.98 | _step_zero_grad: 0.48 | _step_check_overflow: 0.64 samples/sec: 16.078 | iteration 21810/ 143000 | elapsed time per iteration (ms): 63690.7 | learning rate: 5.705E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.344778E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 21:02:48,619] [INFO] [logging.py:60:log_dist] [Rank 0] step=21820, skipped=24, lr=[0.0005704693355034986, 0.0005704693355034986], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21820 loss: 2.3623 iter time (s): 63.436 samples/sec: 16.142 %comms: 0.0028518068364348 %optimizer_step 0.05606872182622319 %forward: 22.931098131248653 %backward: 61.516691788942104 [2025-04-09 21:02:48,620] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25380.24 | forward: 145465.74 | backward_microstep: 390247.52 | backward: 390237.35 | backward_inner_microstep: 390220.25 | backward_inner: 390213.74 | backward_allreduce_microstep: 8.10 | backward_allreduce: 2.76 | reduce_tied_grads: 0.30 | comms: 18.09 | reduce_grads: 0.20 | step: 355.68 | _step_clipping: 0.12 | _step_step: 353.93 | _step_zero_grad: 0.50 | _step_check_overflow: 0.54 samples/sec: 16.142 | iteration 21820/ 143000 | elapsed time per iteration (ms): 63436.6 | learning rate: 5.705E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.359684E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 21:13:32,417] [INFO] [logging.py:60:log_dist] [Rank 0] step=21830, skipped=24, lr=[0.000570440814421762, 0.000570440814421762], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21830 loss: 2.3420 iter time (s): 64.379 samples/sec: 15.906 %comms: 0.0028282104834004475 %optimizer_step 0.05629908692382395 %forward: 22.635371679661716 %backward: 60.643279072348655 [2025-04-09 21:13:32,417] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 34319.73 | forward: 145724.68 | backward_microstep: 390429.88 | backward: 390416.49 | backward_inner_microstep: 390398.54 | backward_inner: 390389.72 | backward_allreduce_microstep: 8.53 | backward_allreduce: 2.98 | reduce_tied_grads: 0.33 | comms: 18.21 | reduce_grads: 0.22 | step: 362.45 | _step_clipping: 0.12 | _step_step: 360.50 | _step_zero_grad: 0.59 | _step_check_overflow: 0.62 samples/sec: 15.906 | iteration 21830/ 143000 | elapsed time per iteration (ms): 64379.8 | learning rate: 5.704E-04 | approx flops per GPU: 68.6TFLOPS | lm_loss: 2.348091E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 21:24:05,685] [INFO] [logging.py:60:log_dist] [Rank 0] step=21840, skipped=24, lr=[0.000570412280287339, 0.000570412280287339], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21840 loss: 2.3483 iter time (s): 63.326 samples/sec: 16.170 %comms: 0.002826176245736028 %optimizer_step 0.05550338168670171 %forward: 22.985228494726787 %backward: 61.678498285014086 [2025-04-09 21:24:05,686] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23786.31 | forward: 145556.95 | backward_microstep: 390602.11 | backward: 390587.12 | backward_inner_microstep: 390567.70 | backward_inner: 390560.93 | backward_allreduce_microstep: 8.09 | backward_allreduce: 2.73 | reduce_tied_grads: 0.27 | comms: 17.90 | reduce_grads: 0.19 | step: 351.48 | _step_clipping: 0.12 | _step_step: 349.79 | _step_zero_grad: 0.46 | _step_check_overflow: 0.56 samples/sec: 16.170 | iteration 21840/ 143000 | elapsed time per iteration (ms): 63326.9 | learning rate: 5.704E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.342640E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 21:34:35,170] [INFO] [logging.py:60:log_dist] [Rank 0] step=21850, skipped=24, lr=[0.0005703837331016068, 0.0005703837331016068], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21850 loss: 2.3403 iter time (s): 62.948 samples/sec: 16.267 %comms: 0.002934448610860336 %optimizer_step 0.0611225920038969 %forward: 23.14073722725584 %backward: 62.07244013565189 [2025-04-09 21:34:35,171] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19659.43 | forward: 145665.92 | backward_microstep: 390752.64 | backward: 390732.55 | backward_inner_microstep: 390714.43 | backward_inner: 390707.24 | backward_allreduce_microstep: 8.27 | backward_allreduce: 2.85 | reduce_tied_grads: 0.32 | comms: 18.47 | reduce_grads: 0.20 | step: 384.75 | _step_clipping: 0.12 | _step_step: 382.66 | _step_zero_grad: 0.58 | _step_check_overflow: 0.71 samples/sec: 16.267 | iteration 21850/ 143000 | elapsed time per iteration (ms): 62948.5 | learning rate: 5.704E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.349030E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 21:45:09,007] [INFO] [logging.py:60:log_dist] [Rank 0] step=21860, skipped=24, lr=[0.0005703551728659432, 0.0005703551728659432], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21860 loss: 2.3392 iter time (s): 63.383 samples/sec: 16.156 %comms: 0.0028899964263650072 %optimizer_step 0.05897492290931712 %forward: 22.995374046704228 %backward: 61.638126687449656 [2025-04-09 21:45:09,008] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24023.23 | forward: 145751.86 | backward_microstep: 390698.23 | backward: 390681.69 | backward_inner_microstep: 390663.66 | backward_inner: 390656.71 | backward_allreduce_microstep: 8.56 | backward_allreduce: 2.92 | reduce_tied_grads: 0.35 | comms: 18.32 | reduce_grads: 0.23 | step: 373.80 | _step_clipping: 0.13 | _step_step: 371.98 | _step_zero_grad: 0.56 | _step_check_overflow: 0.49 samples/sec: 16.156 | iteration 21860/ 143000 | elapsed time per iteration (ms): 63383.7 | learning rate: 5.704E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.343989E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 21:55:49,667] [INFO] [logging.py:60:log_dist] [Rank 0] step=21870, skipped=24, lr=[0.0005703265995817266, 0.0005703265995817266], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21870 loss: 2.3450 iter time (s): 64.065 samples/sec: 15.984 %comms: 0.0028698997376893328 %optimizer_step 0.05566936367612973 %forward: 22.74734175178603 %backward: 60.93749661635353 [2025-04-09 21:55:49,667] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31198.41 | forward: 145731.74 | backward_microstep: 390411.10 | backward: 390398.47 | backward_inner_microstep: 390380.90 | backward_inner: 390373.98 | backward_allreduce_microstep: 8.34 | backward_allreduce: 2.85 | reduce_tied_grads: 0.30 | comms: 18.39 | reduce_grads: 0.20 | step: 356.65 | _step_clipping: 0.12 | _step_step: 354.88 | _step_zero_grad: 0.53 | _step_check_overflow: 0.51 samples/sec: 15.984 | iteration 21870/ 143000 | elapsed time per iteration (ms): 64066.0 | learning rate: 5.703E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.339977E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 22:06:29,455] [INFO] [logging.py:60:log_dist] [Rank 0] step=21880, skipped=24, lr=[0.0005702980132503362, 0.0005702980132503362], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21880 loss: 2.3446 iter time (s): 63.978 samples/sec: 16.005 %comms: 0.002840531318536042 %optimizer_step 0.05797864134129278 %forward: 22.756973898377318 %backward: 61.03027540677647 [2025-04-09 22:06:29,455] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30375.50 | forward: 145595.10 | backward_microstep: 390473.24 | backward: 390460.92 | backward_inner_microstep: 390443.08 | backward_inner: 390436.14 | backward_allreduce_microstep: 8.52 | backward_allreduce: 2.91 | reduce_tied_grads: 0.33 | comms: 18.17 | reduce_grads: 0.22 | step: 370.94 | _step_clipping: 0.15 | _step_step: 368.77 | _step_zero_grad: 0.59 | _step_check_overflow: 0.57 samples/sec: 16.005 | iteration 21880/ 143000 | elapsed time per iteration (ms): 63978.8 | learning rate: 5.703E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.341390E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 22:17:04,585] [INFO] [logging.py:60:log_dist] [Rank 0] step=21890, skipped=24, lr=[0.0005702694138731516, 0.0005702694138731516], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21890 loss: 2.3506 iter time (s): 63.512 samples/sec: 16.123 %comms: 0.0028800602047226614 %optimizer_step 0.05829642469599469 %forward: 22.90295360140575 %backward: 61.47524165190532 [2025-04-09 22:17:04,586] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25873.00 | forward: 145462.13 | backward_microstep: 390459.20 | backward: 390443.94 | backward_inner_microstep: 390424.28 | backward_inner: 390417.37 | backward_allreduce_microstep: 8.43 | backward_allreduce: 2.89 | reduce_tied_grads: 0.37 | comms: 18.29 | reduce_grads: 0.22 | step: 370.25 | _step_clipping: 0.13 | _step_step: 368.31 | _step_zero_grad: 0.59 | _step_check_overflow: 0.57 samples/sec: 16.123 | iteration 21890/ 143000 | elapsed time per iteration (ms): 63513.1 | learning rate: 5.703E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.354571E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 22:27:34,364] [INFO] [logging.py:60:log_dist] [Rank 0] step=21900, skipped=24, lr=[0.0005702408014515532, 0.0005702408014515532], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21900 loss: 2.3254 iter time (s): 62.977 samples/sec: 16.260 %comms: 0.003350581472111807 %optimizer_step 0.06024637296063246 %forward: 23.137736329250625 %backward: 62.04676534717415 [2025-04-09 22:27:34,364] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19876.82 | forward: 145714.81 | backward_microstep: 390769.96 | backward: 390752.67 | backward_inner_microstep: 390734.67 | backward_inner: 390727.46 | backward_allreduce_microstep: 8.38 | backward_allreduce: 2.94 | reduce_tied_grads: 0.32 | comms: 21.10 | reduce_grads: 0.20 | step: 379.41 | _step_clipping: 0.13 | _step_step: 377.28 | _step_zero_grad: 0.63 | _step_check_overflow: 0.67 samples/sec: 16.260 | iteration 21900/ 143000 | elapsed time per iteration (ms): 62977.8 | learning rate: 5.702E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.346845E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 22:38:11,267] [INFO] [logging.py:60:log_dist] [Rank 0] step=21910, skipped=24, lr=[0.0005702121759869218, 0.0005702121759869218], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21910 loss: 2.3464 iter time (s): 63.690 samples/sec: 16.078 %comms: 0.002861413638901494 %optimizer_step 0.05811812682458169 %forward: 22.88144708315861 %backward: 61.32436326763414 [2025-04-09 22:38:11,267] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27166.27 | forward: 145731.10 | backward_microstep: 390587.56 | backward: 390572.64 | backward_inner_microstep: 390554.89 | backward_inner: 390545.80 | backward_allreduce_microstep: 8.28 | backward_allreduce: 2.86 | reduce_tied_grads: 0.33 | comms: 18.22 | reduce_grads: 0.27 | step: 370.15 | _step_clipping: 0.14 | _step_step: 368.15 | _step_zero_grad: 0.57 | _step_check_overflow: 0.58 samples/sec: 16.078 | iteration 21910/ 143000 | elapsed time per iteration (ms): 63690.3 | learning rate: 5.702E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.348128E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 22:48:39,335] [INFO] [logging.py:60:log_dist] [Rank 0] step=21920, skipped=24, lr=[0.0005701835374806392, 0.0005701835374806392], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21920 loss: 2.3226 iter time (s): 62.806 samples/sec: 16.304 %comms: 0.002916578411392152 %optimizer_step 0.05808577102080729 %forward: 23.196265643841535 %backward: 62.16304172885897 [2025-04-09 22:48:39,336] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18595.14 | forward: 145687.07 | backward_microstep: 390434.48 | backward: 390422.81 | backward_inner_microstep: 390405.62 | backward_inner: 390398.91 | backward_allreduce_microstep: 8.17 | backward_allreduce: 2.81 | reduce_tied_grads: 0.34 | comms: 18.32 | reduce_grads: 0.21 | step: 364.81 | _step_clipping: 0.12 | _step_step: 362.96 | _step_zero_grad: 0.52 | _step_check_overflow: 0.61 samples/sec: 16.304 | iteration 21920/ 143000 | elapsed time per iteration (ms): 62806.9 | learning rate: 5.702E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.344633E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 22:59:10,079] [INFO] [logging.py:60:log_dist] [Rank 0] step=21930, skipped=24, lr=[0.0005701548859340876, 0.0005701548859340876], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21930 loss: 2.3352 iter time (s): 63.074 samples/sec: 16.235 %comms: 0.002919519660558189 %optimizer_step 0.05699073535742784 %forward: 23.069423379573166 %backward: 61.90912179106031 [2025-04-09 22:59:10,080] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21389.59 | forward: 145507.44 | backward_microstep: 390497.41 | backward: 390483.88 | backward_inner_microstep: 390466.73 | backward_inner: 390459.78 | backward_allreduce_microstep: 8.10 | backward_allreduce: 2.79 | reduce_tied_grads: 0.35 | comms: 18.41 | reduce_grads: 0.20 | step: 359.46 | _step_clipping: 0.12 | _step_step: 357.50 | _step_zero_grad: 0.58 | _step_check_overflow: 0.63 samples/sec: 16.235 | iteration 21930/ 143000 | elapsed time per iteration (ms): 63074.4 | learning rate: 5.702E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.342191E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 23:09:47,561] [INFO] [logging.py:60:log_dist] [Rank 0] step=21940, skipped=24, lr=[0.0005701262213486497, 0.0005701262213486497], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21940 loss: 2.3595 iter time (s): 63.748 samples/sec: 16.063 %comms: 0.002853426274469415 %optimizer_step 0.05713913744847025 %forward: 22.890693396452765 %backward: 61.293426115472826 [2025-04-09 23:09:47,562] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27415.08 | forward: 145922.67 | backward_microstep: 390744.80 | backward: 390730.87 | backward_inner_microstep: 390712.86 | backward_inner: 390705.96 | backward_allreduce_microstep: 8.46 | backward_allreduce: 2.91 | reduce_tied_grads: 0.37 | comms: 18.19 | reduce_grads: 0.23 | step: 364.25 | _step_clipping: 0.15 | _step_step: 362.12 | _step_zero_grad: 0.53 | _step_check_overflow: 0.82 samples/sec: 16.063 | iteration 21940/ 143000 | elapsed time per iteration (ms): 63748.2 | learning rate: 5.701E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.356709E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 23:20:27,671] [INFO] [logging.py:60:log_dist] [Rank 0] step=21950, skipped=24, lr=[0.0005700975437257092, 0.0005700975437257092], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21950 loss: 2.3190 iter time (s): 64.010 samples/sec: 15.997 %comms: 0.0029103673270898136 %optimizer_step 0.05711161486501923 %forward: 22.799332870968435 %backward: 61.04481825001873 [2025-04-09 23:20:27,671] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29992.19 | forward: 145938.93 | backward_microstep: 390767.57 | backward: 390748.96 | backward_inner_microstep: 390730.89 | backward_inner: 390723.82 | backward_allreduce_microstep: 8.51 | backward_allreduce: 2.97 | reduce_tied_grads: 0.39 | comms: 18.63 | reduce_grads: 0.26 | step: 365.57 | _step_clipping: 0.15 | _step_step: 363.59 | _step_zero_grad: 0.56 | _step_check_overflow: 0.59 samples/sec: 15.997 | iteration 21950/ 143000 | elapsed time per iteration (ms): 64010.9 | learning rate: 5.701E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.346924E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 23:31:07,548] [INFO] [logging.py:60:log_dist] [Rank 0] step=21960, skipped=24, lr=[0.0005700688530666499, 0.0005700688530666499], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21960 loss: 2.3478 iter time (s): 63.987 samples/sec: 16.003 %comms: 0.0029104092506042367 %optimizer_step 0.05787276695967225 %forward: 22.785107923901275 %backward: 61.051552942751854 [2025-04-09 23:31:07,549] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30000.98 | forward: 145795.38 | backward_microstep: 390667.18 | backward: 390651.40 | backward_inner_microstep: 390633.47 | backward_inner: 390626.25 | backward_allreduce_microstep: 8.24 | backward_allreduce: 2.86 | reduce_tied_grads: 0.36 | comms: 18.62 | reduce_grads: 0.25 | step: 370.31 | _step_clipping: 0.15 | _step_step: 368.05 | _step_zero_grad: 0.57 | _step_check_overflow: 0.88 samples/sec: 16.003 | iteration 21960/ 143000 | elapsed time per iteration (ms): 63987.8 | learning rate: 5.701E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.344959E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 23:41:37,599] [INFO] [logging.py:60:log_dist] [Rank 0] step=21970, skipped=24, lr=[0.0005700401493728568, 0.0005700401493728568], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21970 loss: 2.3801 iter time (s): 63.004 samples/sec: 16.253 %comms: 0.0029350674752266826 %optimizer_step 0.05706750208813919 %forward: 23.1498156100867 %backward: 62.01469939098859 [2025-04-09 23:41:37,599] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20068.92 | forward: 145854.07 | backward_microstep: 390736.54 | backward: 390720.00 | backward_inner_microstep: 390700.32 | backward_inner: 390693.10 | backward_allreduce_microstep: 8.45 | backward_allreduce: 3.04 | reduce_tied_grads: 0.71 | comms: 18.49 | reduce_grads: 0.20 | step: 359.55 | _step_clipping: 0.11 | _step_step: 357.66 | _step_zero_grad: 0.54 | _step_check_overflow: 0.61 samples/sec: 16.253 | iteration 21970/ 143000 | elapsed time per iteration (ms): 63005.0 | learning rate: 5.700E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.348733E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-09 23:52:08,484] [INFO] [logging.py:60:log_dist] [Rank 0] step=21980, skipped=24, lr=[0.0005700114326457153, 0.0005700114326457153], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21980 loss: 2.3397 iter time (s): 63.088 samples/sec: 16.231 %comms: 0.0029620153872258873 %optimizer_step 0.05659914145573208 %forward: 23.07994914584872 %backward: 61.91389940598935 [2025-04-09 23:52:08,485] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21281.89 | forward: 145606.85 | backward_microstep: 390617.16 | backward: 390602.59 | backward_inner_microstep: 390585.18 | backward_inner: 390578.46 | backward_allreduce_microstep: 8.03 | backward_allreduce: 2.77 | reduce_tied_grads: 0.30 | comms: 18.69 | reduce_grads: 0.19 | step: 357.07 | _step_clipping: 0.12 | _step_step: 355.26 | _step_zero_grad: 0.51 | _step_check_overflow: 0.57 samples/sec: 16.231 | iteration 21980/ 143000 | elapsed time per iteration (ms): 63088.6 | learning rate: 5.700E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.355431E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 00:02:36,859] [INFO] [logging.py:60:log_dist] [Rank 0] step=21990, skipped=24, lr=[0.0005699827028866111, 0.0005699827028866111], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 21990 loss: 2.3657 iter time (s): 62.837 samples/sec: 16.296 %comms: 0.0029550322578070463 %optimizer_step 0.05780752923232898 %forward: 23.17836093218699 %backward: 62.17673148167182 [2025-04-10 00:02:36,860] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18600.05 | forward: 145645.71 | backward_microstep: 390718.56 | backward: 390699.50 | backward_inner_microstep: 390679.79 | backward_inner: 390672.68 | backward_allreduce_microstep: 8.28 | backward_allreduce: 2.86 | reduce_tied_grads: 0.37 | comms: 18.57 | reduce_grads: 0.22 | step: 363.24 | _step_clipping: 0.12 | _step_step: 361.04 | _step_zero_grad: 0.57 | _step_check_overflow: 0.83 samples/sec: 16.296 | iteration 21990/ 143000 | elapsed time per iteration (ms): 62837.6 | learning rate: 5.700E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.350688E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 00:13:06,605] [INFO] [logging.py:60:log_dist] [Rank 0] step=22000, skipped=24, lr=[0.0005699539600969312, 0.0005699539600969312], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22000 loss: 2.3681 iter time (s): 62.974 samples/sec: 16.261 %comms: 0.00291165431086956 %optimizer_step 0.0562935608397674 %forward: 23.166338325516328 %backward: 62.048724082705974 [2025-04-10 00:13:06,606] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19632.78 | forward: 145887.44 | backward_microstep: 390762.69 | backward: 390744.94 | backward_inner_microstep: 390726.97 | backward_inner: 390719.79 | backward_allreduce_microstep: 8.41 | backward_allreduce: 2.90 | reduce_tied_grads: 0.31 | comms: 18.34 | reduce_grads: 0.20 | step: 354.50 | _step_clipping: 0.13 | _step_step: 352.71 | _step_zero_grad: 0.52 | _step_check_overflow: 0.50 samples/sec: 16.261 | iteration 22000/ 143000 | elapsed time per iteration (ms): 62974.6 | learning rate: 5.700E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.363005E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 00:13:09,477] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: ../checkpoints/mydata_left-pythia160m/global_step22000/mp_rank_00_model_states.pt [2025-04-10 00:13:23,907] [INFO] [engine.py:1805:_copy_recovery_script] creating recovery script ../checkpoints/mydata_left-pythia160m/zero_to_fp32.py [2025-04-10 00:13:23,913] [INFO] [engine.py:1818:_save_zero_checkpoint] zero checkpoint saved ../checkpoints/mydata_left-pythia160m/global_step22000/zero_pp_rank_0_mp_rank_00_optim_states.pt [2025-04-10 00:23:57,739] [INFO] [logging.py:60:log_dist] [Rank 0] step=22010, skipped=24, lr=[0.0005699252042780626, 0.0005699252042780626], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22010 loss: 2.3526 iter time (s): 63.381 samples/sec: 16.156 %comms: 0.0028835735296442295 %optimizer_step 0.05712396139315108 %forward: 22.999800204470066 %backward: 61.63252326885228 [2025-04-10 00:23:57,739] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23930.68 | forward: 145775.64 | backward_microstep: 390647.39 | backward: 390634.71 | backward_inner_microstep: 390617.46 | backward_inner: 390610.63 | backward_allreduce_microstep: 8.20 | backward_allreduce: 2.85 | reduce_tied_grads: 0.37 | comms: 18.28 | reduce_grads: 0.21 | step: 362.06 | _step_clipping: 0.18 | _step_step: 360.16 | _step_zero_grad: 0.56 | _step_check_overflow: 0.54 samples/sec: 15.726 | iteration 22010/ 143000 | elapsed time per iteration (ms): 65113.3 | learning rate: 5.699E-04 | approx flops per GPU: 67.8TFLOPS | lm_loss: 2.357097E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 00:34:37,911] [INFO] [logging.py:60:log_dist] [Rank 0] step=22020, skipped=24, lr=[0.0005698964354313933, 0.0005698964354313933], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22020 loss: 2.3410 iter time (s): 64.017 samples/sec: 15.996 %comms: 0.002817971947860194 %optimizer_step 0.056922407660469035 %forward: 22.796082059159886 %backward: 61.02128480199778 [2025-04-10 00:34:37,912] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30141.24 | forward: 145932.81 | backward_microstep: 390650.58 | backward: 390637.63 | backward_inner_microstep: 390620.04 | backward_inner: 390613.25 | backward_allreduce_microstep: 8.30 | backward_allreduce: 2.85 | reduce_tied_grads: 0.32 | comms: 18.04 | reduce_grads: 0.22 | step: 364.40 | _step_clipping: 0.13 | _step_step: 362.51 | _step_zero_grad: 0.54 | _step_check_overflow: 0.56 samples/sec: 15.996 | iteration 22020/ 143000 | elapsed time per iteration (ms): 64017.2 | learning rate: 5.699E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.342047E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 00:45:13,075] [INFO] [logging.py:60:log_dist] [Rank 0] step=22030, skipped=24, lr=[0.0005698676535583117, 0.0005698676535583117], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22030 loss: 2.3360 iter time (s): 63.516 samples/sec: 16.122 %comms: 0.0029286623536741675 %optimizer_step 0.05912717832728899 %forward: 22.959659118431492 %backward: 61.53585861509278 [2025-04-10 00:45:13,076] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24986.17 | forward: 145830.29 | backward_microstep: 390871.46 | backward: 390850.41 | backward_inner_microstep: 390828.13 | backward_inner: 390817.40 | backward_allreduce_microstep: 8.92 | backward_allreduce: 2.97 | reduce_tied_grads: 0.34 | comms: 18.60 | reduce_grads: 0.23 | step: 375.55 | _step_clipping: 0.14 | _step_step: 373.38 | _step_zero_grad: 0.61 | _step_check_overflow: 0.73 samples/sec: 16.122 | iteration 22030/ 143000 | elapsed time per iteration (ms): 63516.5 | learning rate: 5.699E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.342870E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 00:55:58,123] [INFO] [logging.py:60:log_dist] [Rank 0] step=22040, skipped=24, lr=[0.000569838858660207, 0.000569838858660207], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22040 loss: 2.3582 iter time (s): 64.504 samples/sec: 15.875 %comms: 0.002893035159878217 %optimizer_step 0.05894187210382077 %forward: 22.629396274743105 %backward: 60.62310793824327 [2025-04-10 00:55:58,124] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 34487.62 | forward: 145968.87 | backward_microstep: 391064.66 | backward: 391043.85 | backward_inner_microstep: 391025.24 | backward_inner: 391015.65 | backward_allreduce_microstep: 8.59 | backward_allreduce: 2.98 | reduce_tied_grads: 0.37 | comms: 18.66 | reduce_grads: 0.21 | step: 380.20 | _step_clipping: 0.13 | _step_step: 378.08 | _step_zero_grad: 0.61 | _step_check_overflow: 0.70 samples/sec: 15.875 | iteration 22040/ 143000 | elapsed time per iteration (ms): 64504.8 | learning rate: 5.698E-04 | approx flops per GPU: 68.5TFLOPS | lm_loss: 2.345003E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 01:06:34,615] [INFO] [logging.py:60:log_dist] [Rank 0] step=22050, skipped=24, lr=[0.000569810050738469, 0.000569810050738469], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22050 loss: 2.3496 iter time (s): 63.649 samples/sec: 16.088 %comms: 0.0028827377957261605 %optimizer_step 0.05611412176602115 %forward: 22.888537178828887 %backward: 61.358661525550275 [2025-04-10 01:06:34,616] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26834.72 | forward: 145682.29 | backward_microstep: 390553.57 | backward: 390539.17 | backward_inner_microstep: 390519.07 | backward_inner: 390510.29 | backward_allreduce_microstep: 8.57 | backward_allreduce: 2.93 | reduce_tied_grads: 0.38 | comms: 18.35 | reduce_grads: 0.41 | step: 357.16 | _step_clipping: 0.14 | _step_step: 355.24 | _step_zero_grad: 0.54 | _step_check_overflow: 0.61 samples/sec: 16.088 | iteration 22050/ 143000 | elapsed time per iteration (ms): 63649.2 | learning rate: 5.698E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.341695E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 01:17:15,816] [INFO] [logging.py:60:log_dist] [Rank 0] step=22060, skipped=24, lr=[0.0005697812297944882, 0.0005697812297944882], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22060 loss: 2.3501 iter time (s): 64.119 samples/sec: 15.970 %comms: 0.0028424914182164783 %optimizer_step 0.056496027516081425 %forward: 22.719982109211394 %backward: 60.89574832478581 [2025-04-10 01:17:15,817] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31644.15 | forward: 145679.35 | backward_microstep: 390473.33 | backward: 390460.40 | backward_inner_microstep: 390442.44 | backward_inner: 390435.48 | backward_allreduce_microstep: 8.58 | backward_allreduce: 2.93 | reduce_tied_grads: 0.32 | comms: 18.23 | reduce_grads: 0.21 | step: 362.25 | _step_clipping: 0.12 | _step_step: 360.22 | _step_zero_grad: 0.55 | _step_check_overflow: 0.74 samples/sec: 15.970 | iteration 22060/ 143000 | elapsed time per iteration (ms): 64120.1 | learning rate: 5.698E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.342867E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 01:28:00,002] [INFO] [logging.py:60:log_dist] [Rank 0] step=22070, skipped=24, lr=[0.0005697523958296554, 0.0005697523958296554], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22070 loss: 2.3538 iter time (s): 64.418 samples/sec: 15.896 %comms: 0.0028003045139533425 %optimizer_step 0.055480383173474895 %forward: 22.633958127913147 %backward: 60.62590979640047 [2025-04-10 01:28:00,002] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 34414.65 | forward: 145803.32 | backward_microstep: 390557.63 | backward: 390539.69 | backward_inner_microstep: 390521.53 | backward_inner: 390514.50 | backward_allreduce_microstep: 8.64 | backward_allreduce: 2.96 | reduce_tied_grads: 0.32 | comms: 18.04 | reduce_grads: 0.21 | step: 357.39 | _step_clipping: 0.12 | _step_step: 355.56 | _step_zero_grad: 0.51 | _step_check_overflow: 0.57 samples/sec: 15.896 | iteration 22070/ 143000 | elapsed time per iteration (ms): 64418.5 | learning rate: 5.698E-04 | approx flops per GPU: 68.6TFLOPS | lm_loss: 2.349348E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 01:38:42,306] [INFO] [logging.py:60:log_dist] [Rank 0] step=22080, skipped=24, lr=[0.0005697235488453624, 0.0005697235488453624], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22080 loss: 2.3384 iter time (s): 64.230 samples/sec: 15.943 %comms: 0.002879111318429134 %optimizer_step 0.05680709386273018 %forward: 22.707315592757656 %backward: 60.812931882218415 [2025-04-10 01:38:42,306] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32427.67 | forward: 145848.52 | backward_microstep: 390615.40 | backward: 390599.95 | backward_inner_microstep: 390581.89 | backward_inner: 390574.77 | backward_allreduce_microstep: 8.47 | backward_allreduce: 2.89 | reduce_tied_grads: 0.34 | comms: 18.49 | reduce_grads: 0.23 | step: 364.87 | _step_clipping: 0.15 | _step_step: 362.99 | _step_zero_grad: 0.57 | _step_check_overflow: 0.49 samples/sec: 15.943 | iteration 22080/ 143000 | elapsed time per iteration (ms): 64230.4 | learning rate: 5.697E-04 | approx flops per GPU: 68.8TFLOPS | lm_loss: 2.343636E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 01:49:28,840] [INFO] [logging.py:60:log_dist] [Rank 0] step=22090, skipped=24, lr=[0.0005696946888430014, 0.0005696946888430014], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22090 loss: 2.3424 iter time (s): 64.653 samples/sec: 15.838 %comms: 0.0028651758784410987 %optimizer_step 0.05683835425886193 %forward: 22.541566188911755 %backward: 60.419590529586095 [2025-04-10 01:49:28,841] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 36726.08 | forward: 145737.58 | backward_microstep: 390645.92 | backward: 390629.69 | backward_inner_microstep: 390611.28 | backward_inner: 390604.08 | backward_allreduce_microstep: 8.50 | backward_allreduce: 2.95 | reduce_tied_grads: 0.38 | comms: 18.52 | reduce_grads: 0.24 | step: 367.48 | _step_clipping: 0.15 | _step_step: 365.33 | _step_zero_grad: 0.62 | _step_check_overflow: 0.69 samples/sec: 15.838 | iteration 22090/ 143000 | elapsed time per iteration (ms): 64653.5 | learning rate: 5.697E-04 | approx flops per GPU: 68.3TFLOPS | lm_loss: 2.337216E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 02:00:15,635] [INFO] [logging.py:60:log_dist] [Rank 0] step=22100, skipped=24, lr=[0.0005696658158239655, 0.0005696658158239655], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22100 loss: 2.3429 iter time (s): 64.679 samples/sec: 15.832 %comms: 0.0028435994534164078 %optimizer_step 0.056841578014050737 %forward: 22.548059814732245 %backward: 60.41564778422681 [2025-04-10 02:00:15,636] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 36734.31 | forward: 145838.35 | backward_microstep: 390780.95 | backward: 390761.71 | backward_inner_microstep: 390743.49 | backward_inner: 390736.22 | backward_allreduce_microstep: 8.41 | backward_allreduce: 2.90 | reduce_tied_grads: 0.34 | comms: 18.39 | reduce_grads: 0.21 | step: 367.65 | _step_clipping: 0.13 | _step_step: 365.58 | _step_zero_grad: 0.60 | _step_check_overflow: 0.65 samples/sec: 15.832 | iteration 22100/ 143000 | elapsed time per iteration (ms): 64679.5 | learning rate: 5.697E-04 | approx flops per GPU: 68.3TFLOPS | lm_loss: 2.341794E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 02:10:56,445] [INFO] [logging.py:60:log_dist] [Rank 0] step=22110, skipped=24, lr=[0.000569636929789648, 0.000569636929789648], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22110 loss: 2.3449 iter time (s): 64.080 samples/sec: 15.980 %comms: 0.0028889848699824 %optimizer_step 0.05698006234368182 %forward: 22.74534250186038 %backward: 60.95811403269682 [2025-04-10 02:10:56,446] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30994.57 | forward: 145753.03 | backward_microstep: 390635.97 | backward: 390621.93 | backward_inner_microstep: 390603.96 | backward_inner: 390596.57 | backward_allreduce_microstep: 8.54 | backward_allreduce: 3.02 | reduce_tied_grads: 0.35 | comms: 18.51 | reduce_grads: 0.21 | step: 365.13 | _step_clipping: 0.13 | _step_step: 363.25 | _step_zero_grad: 0.50 | _step_check_overflow: 0.64 samples/sec: 15.980 | iteration 22110/ 143000 | elapsed time per iteration (ms): 64081.0 | learning rate: 5.696E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.334578E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 02:21:29,129] [INFO] [logging.py:60:log_dist] [Rank 0] step=22120, skipped=24, lr=[0.0005696080307414432, 0.0005696080307414432], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22120 loss: 2.3401 iter time (s): 63.268 samples/sec: 16.185 %comms: 0.0029129814129311054 %optimizer_step 0.058175027440096484 %forward: 23.023869360104896 %backward: 61.721729617511436 [2025-04-10 02:21:29,130] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23079.61 | forward: 145666.67 | backward_microstep: 390511.39 | backward: 390499.03 | backward_inner_microstep: 390480.90 | backward_inner: 390473.87 | backward_allreduce_microstep: 8.64 | backward_allreduce: 2.98 | reduce_tied_grads: 0.34 | comms: 18.43 | reduce_grads: 0.22 | step: 368.06 | _step_clipping: 0.13 | _step_step: 365.85 | _step_zero_grad: 0.76 | _step_check_overflow: 0.65 samples/sec: 16.185 | iteration 22120/ 143000 | elapsed time per iteration (ms): 63268.3 | learning rate: 5.696E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.344127E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 02:32:08,607] [INFO] [logging.py:60:log_dist] [Rank 0] step=22130, skipped=24, lr=[0.000569579118680746, 0.000569579118680746], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22130 loss: 2.3408 iter time (s): 63.947 samples/sec: 16.013 %comms: 0.0028473161158733807 %optimizer_step 0.05676777085183474 %forward: 22.76781665928968 %backward: 61.054116222679 [2025-04-10 02:32:08,608] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30054.01 | forward: 145593.81 | backward_microstep: 390443.14 | backward: 390423.96 | backward_inner_microstep: 390405.92 | backward_inner: 390399.03 | backward_allreduce_microstep: 8.51 | backward_allreduce: 2.94 | reduce_tied_grads: 0.34 | comms: 18.21 | reduce_grads: 0.22 | step: 363.01 | _step_clipping: 0.12 | _step_step: 360.98 | _step_zero_grad: 0.55 | _step_check_overflow: 0.74 samples/sec: 16.013 | iteration 22130/ 143000 | elapsed time per iteration (ms): 63947.8 | learning rate: 5.696E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.341073E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 02:42:52,559] [INFO] [logging.py:60:log_dist] [Rank 0] step=22140, skipped=24, lr=[0.0005695501936089515, 0.0005695501936089515], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22140 loss: 2.3158 iter time (s): 64.395 samples/sec: 15.902 %comms: 0.0028468252342719095 %optimizer_step 0.05630423476053688 %forward: 22.63956054028701 %backward: 60.6337193806563 [2025-04-10 02:42:52,560] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 34331.75 | forward: 145786.44 | backward_microstep: 390461.56 | backward: 390448.14 | backward_inner_microstep: 390428.21 | backward_inner: 390421.27 | backward_allreduce_microstep: 8.52 | backward_allreduce: 2.91 | reduce_tied_grads: 0.36 | comms: 18.33 | reduce_grads: 0.23 | step: 362.57 | _step_clipping: 0.14 | _step_step: 360.61 | _step_zero_grad: 0.56 | _step_check_overflow: 0.60 samples/sec: 15.902 | iteration 22140/ 143000 | elapsed time per iteration (ms): 64395.2 | learning rate: 5.696E-04 | approx flops per GPU: 68.6TFLOPS | lm_loss: 2.338403E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 02:53:30,176] [INFO] [logging.py:60:log_dist] [Rank 0] step=22150, skipped=24, lr=[0.0005695212555274561, 0.0005695212555274561], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22150 loss: 2.3496 iter time (s): 63.761 samples/sec: 16.060 %comms: 0.0028701725782706014 %optimizer_step 0.05731064341070154 %forward: 22.858968323141298 %backward: 61.26173802120745 [2025-04-10 02:53:30,177] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27807.14 | forward: 145751.27 | backward_microstep: 390630.64 | backward: 390611.52 | backward_inner_microstep: 390590.04 | backward_inner: 390581.21 | backward_allreduce_microstep: 11.82 | backward_allreduce: 4.65 | reduce_tied_grads: 0.34 | comms: 18.30 | reduce_grads: 0.20 | step: 365.42 | _step_clipping: 0.13 | _step_step: 363.66 | _step_zero_grad: 0.51 | _step_check_overflow: 0.50 samples/sec: 16.060 | iteration 22150/ 143000 | elapsed time per iteration (ms): 63761.7 | learning rate: 5.695E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.338918E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 03:04:08,795] [INFO] [logging.py:60:log_dist] [Rank 0] step=22160, skipped=24, lr=[0.0005694923044376563, 0.0005694923044376563], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22160 loss: 2.3552 iter time (s): 63.861 samples/sec: 16.035 %comms: 0.002855549894557718 %optimizer_step 0.05847480138257373 %forward: 22.81222750623834 %backward: 61.15915208777535 [2025-04-10 03:04:08,796] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28925.10 | forward: 145681.92 | backward_microstep: 390585.58 | backward: 390570.49 | backward_inner_microstep: 390553.13 | backward_inner: 390546.19 | backward_allreduce_microstep: 8.14 | backward_allreduce: 2.83 | reduce_tied_grads: 0.34 | comms: 18.24 | reduce_grads: 0.20 | step: 373.43 | _step_clipping: 0.13 | _step_step: 371.41 | _step_zero_grad: 0.62 | _step_check_overflow: 0.63 samples/sec: 16.035 | iteration 22160/ 143000 | elapsed time per iteration (ms): 63862.0 | learning rate: 5.695E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.352156E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 03:14:57,458] [INFO] [logging.py:60:log_dist] [Rank 0] step=22170, skipped=24, lr=[0.0005694633403409494, 0.0005694633403409494], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22170 loss: 2.3455 iter time (s): 64.866 samples/sec: 15.786 %comms: 0.002832251881334045 %optimizer_step 0.057423120320773 %forward: 22.493833684541485 %backward: 60.22259242248475 [2025-04-10 03:14:57,459] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 38639.27 | forward: 145907.68 | backward_microstep: 390653.67 | backward: 390637.68 | backward_inner_microstep: 390617.36 | backward_inner: 390610.23 | backward_allreduce_microstep: 10.42 | backward_allreduce: 4.79 | reduce_tied_grads: 0.32 | comms: 18.37 | reduce_grads: 0.20 | step: 372.48 | _step_clipping: 0.14 | _step_step: 370.49 | _step_zero_grad: 0.55 | _step_check_overflow: 0.67 samples/sec: 15.786 | iteration 22170/ 143000 | elapsed time per iteration (ms): 64866.3 | learning rate: 5.695E-04 | approx flops per GPU: 68.1TFLOPS | lm_loss: 2.353417E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 03:25:39,265] [INFO] [logging.py:60:log_dist] [Rank 0] step=22180, skipped=24, lr=[0.0005694343632387334, 0.0005694343632387334], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22180 loss: 2.3380 iter time (s): 64.180 samples/sec: 15.955 %comms: 0.002852734932150848 %optimizer_step 0.057230447591020105 %forward: 22.708529669831716 %backward: 60.81712877851485 [2025-04-10 03:25:39,265] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32293.11 | forward: 145743.54 | backward_microstep: 390339.03 | backward: 390324.87 | backward_inner_microstep: 390307.08 | backward_inner: 390298.39 | backward_allreduce_microstep: 8.34 | backward_allreduce: 2.87 | reduce_tied_grads: 0.36 | comms: 18.31 | reduce_grads: 0.20 | step: 367.31 | _step_clipping: 0.14 | _step_step: 365.21 | _step_zero_grad: 0.50 | _step_check_overflow: 0.85 samples/sec: 15.955 | iteration 22180/ 143000 | elapsed time per iteration (ms): 64180.7 | learning rate: 5.694E-04 | approx flops per GPU: 68.8TFLOPS | lm_loss: 2.343066E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 03:36:15,694] [INFO] [logging.py:60:log_dist] [Rank 0] step=22190, skipped=24, lr=[0.0005694053731324068, 0.0005694053731324068], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22190 loss: 2.3333 iter time (s): 63.642 samples/sec: 16.090 %comms: 0.0028403156467825554 %optimizer_step 0.05610097300321261 %forward: 22.859078318351838 %backward: 61.31996021009161 [2025-04-10 03:36:15,694] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27288.92 | forward: 145480.41 | backward_microstep: 390265.47 | backward: 390254.28 | backward_inner_microstep: 390235.14 | backward_inner: 390228.33 | backward_allreduce_microstep: 10.01 | backward_allreduce: 2.91 | reduce_tied_grads: 0.33 | comms: 18.08 | reduce_grads: 0.21 | step: 357.04 | _step_clipping: 0.13 | _step_step: 355.11 | _step_zero_grad: 0.55 | _step_check_overflow: 0.64 samples/sec: 16.090 | iteration 22190/ 143000 | elapsed time per iteration (ms): 63642.9 | learning rate: 5.694E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.339778E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 03:46:53,533] [INFO] [logging.py:60:log_dist] [Rank 0] step=22200, skipped=24, lr=[0.000569376370023369, 0.000569376370023369], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22200 loss: 2.3467 iter time (s): 63.783 samples/sec: 16.054 %comms: 0.002843942885866872 %optimizer_step 0.055850286599079764 %forward: 22.799436291742552 %backward: 61.177592686327586 [2025-04-10 03:46:53,533] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28809.15 | forward: 145422.28 | backward_microstep: 390226.39 | backward: 390210.76 | backward_inner_microstep: 390191.30 | backward_inner: 390182.65 | backward_allreduce_microstep: 10.16 | backward_allreduce: 2.94 | reduce_tied_grads: 0.35 | comms: 18.14 | reduce_grads: 0.22 | step: 356.23 | _step_clipping: 0.15 | _step_step: 354.38 | _step_zero_grad: 0.55 | _step_check_overflow: 0.52 samples/sec: 16.054 | iteration 22200/ 143000 | elapsed time per iteration (ms): 63783.9 | learning rate: 5.694E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.345963E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 03:57:33,110] [INFO] [logging.py:60:log_dist] [Rank 0] step=22210, skipped=24, lr=[0.0005693473539130195, 0.0005693473539130195], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22210 loss: 2.3475 iter time (s): 63.957 samples/sec: 16.011 %comms: 0.002859885499404173 %optimizer_step 0.055200621353609404 %forward: 22.766006982546998 %backward: 61.0150315321649 [2025-04-10 03:57:33,110] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30247.09 | forward: 145604.76 | backward_microstep: 390252.41 | backward: 390234.41 | backward_inner_microstep: 390216.65 | backward_inner: 390207.77 | backward_allreduce_microstep: 8.25 | backward_allreduce: 2.84 | reduce_tied_grads: 0.35 | comms: 18.29 | reduce_grads: 0.23 | step: 353.05 | _step_clipping: 0.13 | _step_step: 351.13 | _step_zero_grad: 0.55 | _step_check_overflow: 0.53 samples/sec: 16.011 | iteration 22210/ 143000 | elapsed time per iteration (ms): 63957.7 | learning rate: 5.693E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.354928E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 04:08:14,631] [INFO] [logging.py:60:log_dist] [Rank 0] step=22220, skipped=24, lr=[0.0005693183248027591, 0.0005693183248027591], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22220 loss: 2.3293 iter time (s): 64.152 samples/sec: 15.962 %comms: 0.0028944373758819876 %optimizer_step 0.05736412818817621 %forward: 22.685948243147852 %backward: 60.83870066397886 [2025-04-10 04:08:14,632] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32179.61 | forward: 145533.98 | backward_microstep: 390300.89 | backward: 390289.97 | backward_inner_microstep: 390272.73 | backward_inner: 390266.02 | backward_allreduce_microstep: 8.21 | backward_allreduce: 2.83 | reduce_tied_grads: 0.33 | comms: 18.57 | reduce_grads: 0.22 | step: 368.00 | _step_clipping: 0.12 | _step_step: 366.00 | _step_zero_grad: 0.59 | _step_check_overflow: 0.64 samples/sec: 15.962 | iteration 22220/ 143000 | elapsed time per iteration (ms): 64152.2 | learning rate: 5.693E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.340007E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 04:18:54,230] [INFO] [logging.py:60:log_dist] [Rank 0] step=22230, skipped=24, lr=[0.0005692892826939885, 0.0005692892826939885], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22230 loss: 2.3395 iter time (s): 63.959 samples/sec: 16.010 %comms: 0.0028278060368415554 %optimizer_step 0.05740584178362525 %forward: 22.780382342912652 %backward: 61.02903007300037 [2025-04-10 04:18:54,230] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29993.33 | forward: 145701.60 | backward_microstep: 390348.91 | backward: 390337.06 | backward_inner_microstep: 390319.34 | backward_inner: 390312.47 | backward_allreduce_microstep: 8.49 | backward_allreduce: 2.96 | reduce_tied_grads: 0.33 | comms: 18.09 | reduce_grads: 0.20 | step: 367.16 | _step_clipping: 0.12 | _step_step: 365.23 | _step_zero_grad: 0.53 | _step_check_overflow: 0.68 samples/sec: 16.010 | iteration 22230/ 143000 | elapsed time per iteration (ms): 63959.8 | learning rate: 5.693E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.337106E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 04:22:07,176] [INFO] [stage1.py:697:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 524288.0, reducing to 524288.0 [2025-04-10 04:23:09,944] [INFO] [stage1.py:697:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 524288.0, reducing to 262144.0 [2025-04-10 04:29:30,029] [INFO] [logging.py:60:log_dist] [Rank 0] step=22240, skipped=26, lr=[0.0005692660396489868, 0.0005692660396489868], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22240 loss: 2.3724 iter time (s): 63.579 samples/sec: 16.106 %comms: 0.002337148354810062 %optimizer_step 0.04783033638371712 %forward: 23.579468302037977 %backward: 61.369848883729816 [2025-04-10 04:29:30,030] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22196.71 | forward: 149916.73 | backward_microstep: 390199.89 | backward: 390185.53 | backward_inner_microstep: 390165.73 | backward_inner: 390158.83 | backward_allreduce_microstep: 8.41 | backward_allreduce: 3.00 | reduce_tied_grads: 0.35 | comms: 14.86 | reduce_grads: 0.23 | step: 304.10 | _step_clipping: 0.14 | _step_step: 302.09 | _step_zero_grad: 0.54 | _step_check_overflow: 0.70 samples/sec: 16.106 | iteration 22240/ 143000 | elapsed time per iteration (ms): 63580.0 | learning rate: 5.693E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.352768E+00 | loss scale: 262144.0 | number of skipped iterations: 2 | number of nan iterations: 0 | time (ms) [2025-04-10 04:40:05,930] [INFO] [logging.py:60:log_dist] [Rank 0] step=22250, skipped=26, lr=[0.0005692369741464309, 0.0005692369741464309], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22250 loss: 2.3451 iter time (s): 63.589 samples/sec: 16.103 %comms: 0.0028674614892430805 %optimizer_step 0.057781754213004084 %forward: 22.90121842363867 %backward: 61.38507431850611 [2025-04-10 04:40:05,931] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26387.08 | forward: 145627.47 | backward_microstep: 390357.20 | backward: 390343.98 | backward_inner_microstep: 390325.96 | backward_inner: 390318.89 | backward_allreduce_microstep: 8.50 | backward_allreduce: 2.91 | reduce_tied_grads: 0.35 | comms: 18.23 | reduce_grads: 0.22 | step: 367.43 | _step_clipping: 0.14 | _step_step: 365.38 | _step_zero_grad: 0.63 | _step_check_overflow: 0.63 samples/sec: 16.103 | iteration 22250/ 143000 | elapsed time per iteration (ms): 63590.1 | learning rate: 5.692E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.348752E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 04:50:43,780] [INFO] [logging.py:60:log_dist] [Rank 0] step=22260, skipped=26, lr=[0.0005692078956492915, 0.0005692078956492915], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22260 loss: 2.3561 iter time (s): 63.784 samples/sec: 16.054 %comms: 0.002876003311727046 %optimizer_step 0.056373171045778984 %forward: 22.824678759621538 %backward: 61.18772850002742 [2025-04-10 04:50:43,780] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28464.98 | forward: 145585.75 | backward_microstep: 390293.02 | backward: 390281.99 | backward_inner_microstep: 390262.80 | backward_inner: 390254.45 | backward_allreduce_microstep: 10.21 | backward_allreduce: 4.63 | reduce_tied_grads: 0.32 | comms: 18.34 | reduce_grads: 0.21 | step: 359.57 | _step_clipping: 0.16 | _step_step: 357.77 | _step_zero_grad: 0.51 | _step_check_overflow: 0.53 samples/sec: 16.054 | iteration 22260/ 143000 | elapsed time per iteration (ms): 63785.0 | learning rate: 5.692E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.350668E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 05:01:18,775] [INFO] [logging.py:60:log_dist] [Rank 0] step=22270, skipped=26, lr=[0.0005691788041589717, 0.0005691788041589717], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22270 loss: 2.3451 iter time (s): 63.499 samples/sec: 16.126 %comms: 0.0028406079106685638 %optimizer_step 0.05627888011998385 %forward: 22.902909495327716 %backward: 61.444761381060054 [2025-04-10 05:01:18,776] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25895.11 | forward: 145431.04 | backward_microstep: 390178.15 | backward: 390167.69 | backward_inner_microstep: 390150.76 | backward_inner: 390144.23 | backward_allreduce_microstep: 8.05 | backward_allreduce: 2.78 | reduce_tied_grads: 0.33 | comms: 18.04 | reduce_grads: 0.19 | step: 357.36 | _step_clipping: 0.12 | _step_step: 355.50 | _step_zero_grad: 0.52 | _step_check_overflow: 0.60 samples/sec: 16.126 | iteration 22270/ 143000 | elapsed time per iteration (ms): 63499.5 | learning rate: 5.692E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.347855E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 05:11:55,877] [INFO] [logging.py:60:log_dist] [Rank 0] step=22280, skipped=26, lr=[0.0005691496996768758, 0.0005691496996768758], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22280 loss: 2.3428 iter time (s): 63.710 samples/sec: 16.073 %comms: 0.0031026046653759363 %optimizer_step 0.05757663650481643 %forward: 22.84569551193792 %backward: 61.23166163920929 [2025-04-10 05:11:55,878] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27950.52 | forward: 145549.00 | backward_microstep: 390115.14 | backward: 390104.45 | backward_inner_microstep: 390085.27 | backward_inner: 390078.70 | backward_allreduce_microstep: 8.27 | backward_allreduce: 2.87 | reduce_tied_grads: 0.31 | comms: 19.77 | reduce_grads: 0.23 | step: 366.82 | _step_clipping: 1.85 | _step_step: 363.39 | _step_zero_grad: 0.52 | _step_check_overflow: 0.48 samples/sec: 16.073 | iteration 22280/ 143000 | elapsed time per iteration (ms): 63710.2 | learning rate: 5.691E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.340520E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 05:22:35,533] [INFO] [logging.py:60:log_dist] [Rank 0] step=22290, skipped=26, lr=[0.0005691205822044086, 0.0005691205822044086], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22290 loss: 2.3235 iter time (s): 63.965 samples/sec: 16.009 %comms: 0.002820504982137041 %optimizer_step 0.057096114023365264 %forward: 22.763180122106945 %backward: 61.00650773816301 [2025-04-10 05:22:35,534] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30325.42 | forward: 145604.78 | backward_microstep: 390238.41 | backward: 390228.39 | backward_inner_microstep: 390210.47 | backward_inner: 390203.62 | backward_allreduce_microstep: 8.59 | backward_allreduce: 2.93 | reduce_tied_grads: 0.32 | comms: 18.04 | reduce_grads: 0.21 | step: 365.22 | _step_clipping: 0.12 | _step_step: 363.38 | _step_zero_grad: 0.59 | _step_check_overflow: 0.50 samples/sec: 16.009 | iteration 22290/ 143000 | elapsed time per iteration (ms): 63965.6 | learning rate: 5.691E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.338939E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 05:33:07,172] [INFO] [logging.py:60:log_dist] [Rank 0] step=22300, skipped=26, lr=[0.0005690914517429753, 0.0005690914517429753], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22300 loss: 2.3405 iter time (s): 63.163 samples/sec: 16.212 %comms: 0.0028691785089198776 %optimizer_step 0.05690932454894339 %forward: 23.0199865807813 %backward: 61.79594555090958 [2025-04-10 05:33:07,173] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22414.05 | forward: 145401.79 | backward_microstep: 390334.95 | backward: 390323.47 | backward_inner_microstep: 390305.89 | backward_inner: 390299.28 | backward_allreduce_microstep: 8.36 | backward_allreduce: 2.88 | reduce_tied_grads: 0.36 | comms: 18.12 | reduce_grads: 0.21 | step: 359.46 | _step_clipping: 0.13 | _step_step: 357.65 | _step_zero_grad: 0.54 | _step_check_overflow: 0.48 samples/sec: 16.212 | iteration 22300/ 143000 | elapsed time per iteration (ms): 63163.9 | learning rate: 5.691E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.338401E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 05:43:46,602] [INFO] [logging.py:60:log_dist] [Rank 0] step=22310, skipped=26, lr=[0.0005690623082939819, 0.0005690623082939819], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22310 loss: 2.3440 iter time (s): 63.942 samples/sec: 16.014 %comms: 0.002818852784971461 %optimizer_step 0.055476700696800836 %forward: 22.747843422414803 %backward: 61.03707846058415 [2025-04-10 05:43:46,603] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30215.21 | forward: 145455.36 | backward_microstep: 390298.87 | backward: 390286.23 | backward_inner_microstep: 390268.80 | backward_inner: 390260.60 | backward_allreduce_microstep: 8.34 | backward_allreduce: 2.88 | reduce_tied_grads: 0.32 | comms: 18.02 | reduce_grads: 0.21 | step: 354.73 | _step_clipping: 0.12 | _step_step: 352.90 | _step_zero_grad: 0.52 | _step_check_overflow: 0.61 samples/sec: 16.014 | iteration 22310/ 143000 | elapsed time per iteration (ms): 63943.1 | learning rate: 5.691E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.342635E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 05:54:22,907] [INFO] [logging.py:60:log_dist] [Rank 0] step=22320, skipped=26, lr=[0.0005690331518588349, 0.0005690331518588349], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22320 loss: 2.3227 iter time (s): 63.630 samples/sec: 16.093 %comms: 0.0028362998590212166 %optimizer_step 0.05783284956146806 %forward: 22.87286423657706 %backward: 61.35630998206678 [2025-04-10 05:54:22,908] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26840.80 | forward: 145539.69 | backward_microstep: 390420.53 | backward: 390409.27 | backward_inner_microstep: 390391.86 | backward_inner: 390385.16 | backward_allreduce_microstep: 8.26 | backward_allreduce: 2.82 | reduce_tied_grads: 0.36 | comms: 18.05 | reduce_grads: 0.21 | step: 367.99 | _step_clipping: 0.12 | _step_step: 366.03 | _step_zero_grad: 0.56 | _step_check_overflow: 0.67 samples/sec: 16.093 | iteration 22320/ 143000 | elapsed time per iteration (ms): 63630.5 | learning rate: 5.690E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.337750E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 06:04:59,298] [INFO] [logging.py:60:log_dist] [Rank 0] step=22330, skipped=26, lr=[0.0005690039824389416, 0.0005690039824389416], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22330 loss: 2.3293 iter time (s): 63.638 samples/sec: 16.091 %comms: 0.00283497907182191 %optimizer_step 0.05718302383765914 %forward: 22.877438906783574 %backward: 61.33421332288582 [2025-04-10 06:04:59,299] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27010.86 | forward: 145588.51 | backward_microstep: 390331.34 | backward: 390321.53 | backward_inner_microstep: 390304.81 | backward_inner: 390298.43 | backward_allreduce_microstep: 7.99 | backward_allreduce: 2.71 | reduce_tied_grads: 0.33 | comms: 18.04 | reduce_grads: 0.20 | step: 363.90 | _step_clipping: 0.12 | _step_step: 362.18 | _step_zero_grad: 0.53 | _step_check_overflow: 0.49 samples/sec: 16.091 | iteration 22330/ 143000 | elapsed time per iteration (ms): 63639.1 | learning rate: 5.690E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.337877E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 06:15:38,579] [INFO] [logging.py:60:log_dist] [Rank 0] step=22340, skipped=26, lr=[0.00056897480003571, 0.00056897480003571], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22340 loss: 2.3392 iter time (s): 63.928 samples/sec: 16.018 %comms: 0.0028473725240958027 %optimizer_step 0.05652694770892982 %forward: 22.77084750130871 %backward: 61.04817010272604 [2025-04-10 06:15:38,582] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29961.86 | forward: 145568.36 | backward_microstep: 390277.12 | backward: 390265.74 | backward_inner_microstep: 390248.26 | backward_inner: 390241.59 | backward_allreduce_microstep: 8.31 | backward_allreduce: 2.86 | reduce_tied_grads: 0.33 | comms: 18.20 | reduce_grads: 0.23 | step: 361.36 | _step_clipping: 0.14 | _step_step: 359.55 | _step_zero_grad: 0.55 | _step_check_overflow: 0.50 samples/sec: 16.018 | iteration 22340/ 143000 | elapsed time per iteration (ms): 63928.3 | learning rate: 5.690E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.347234E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 06:26:13,366] [INFO] [logging.py:60:log_dist] [Rank 0] step=22350, skipped=26, lr=[0.0005689456046505483, 0.0005689456046505483], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22350 loss: 2.3475 iter time (s): 63.478 samples/sec: 16.132 %comms: 0.002850828446569975 %optimizer_step 0.056293326427023456 %forward: 22.90587844641711 %backward: 61.48125725087606 [2025-04-10 06:26:13,366] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25668.00 | forward: 145401.60 | backward_microstep: 390280.99 | backward: 390269.82 | backward_inner_microstep: 390252.70 | backward_inner: 390246.17 | backward_allreduce_microstep: 8.15 | backward_allreduce: 2.84 | reduce_tied_grads: 0.35 | comms: 18.10 | reduce_grads: 0.21 | step: 357.34 | _step_clipping: 0.12 | _step_step: 355.48 | _step_zero_grad: 0.56 | _step_check_overflow: 0.57 samples/sec: 16.131 | iteration 22350/ 143000 | elapsed time per iteration (ms): 63478.5 | learning rate: 5.689E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.337850E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 06:36:51,865] [INFO] [logging.py:60:log_dist] [Rank 0] step=22360, skipped=26, lr=[0.0005689163962848657, 0.0005689163962848657], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22360 loss: 2.3463 iter time (s): 63.849 samples/sec: 16.038 %comms: 0.0028541493378117003 %optimizer_step 0.058434016745000476 %forward: 22.819621557923064 %backward: 61.14573012138159 [2025-04-10 06:36:51,865] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28859.68 | forward: 145701.53 | backward_microstep: 390422.75 | backward: 390410.79 | backward_inner_microstep: 390392.95 | backward_inner: 390386.11 | backward_allreduce_microstep: 8.46 | backward_allreduce: 2.91 | reduce_tied_grads: 0.38 | comms: 18.22 | reduce_grads: 0.21 | step: 373.10 | _step_clipping: 0.15 | _step_step: 371.00 | _step_zero_grad: 0.59 | _step_check_overflow: 0.71 samples/sec: 16.038 | iteration 22360/ 143000 | elapsed time per iteration (ms): 63849.9 | learning rate: 5.689E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.344230E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 06:47:26,190] [INFO] [logging.py:60:log_dist] [Rank 0] step=22370, skipped=26, lr=[0.0005688871749400721, 0.0005688871749400721], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22370 loss: 2.3425 iter time (s): 63.432 samples/sec: 16.143 %comms: 0.0028415052647775187 %optimizer_step 0.056844012317441774 %forward: 22.923622432805104 %backward: 61.536219628229375 [2025-04-10 06:47:26,191] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25094.89 | forward: 145408.88 | backward_microstep: 390347.44 | backward: 390335.90 | backward_inner_microstep: 390318.55 | backward_inner: 390311.86 | backward_allreduce_microstep: 8.18 | backward_allreduce: 2.82 | reduce_tied_grads: 0.31 | comms: 18.02 | reduce_grads: 0.20 | step: 360.57 | _step_clipping: 0.15 | _step_step: 358.75 | _step_zero_grad: 0.52 | _step_check_overflow: 0.57 samples/sec: 16.143 | iteration 22370/ 143000 | elapsed time per iteration (ms): 63432.5 | learning rate: 5.689E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.338917E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 06:58:04,243] [INFO] [logging.py:60:log_dist] [Rank 0] step=22380, skipped=26, lr=[0.0005688579406175776, 0.0005688579406175776], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22380 loss: 2.3370 iter time (s): 63.805 samples/sec: 16.049 %comms: 0.0028766172475247855 %optimizer_step 0.05693245649081288 %forward: 22.821563472492308 %backward: 61.18124400549901 [2025-04-10 06:58:04,244] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28583.38 | forward: 145612.36 | backward_microstep: 390376.98 | backward: 390365.25 | backward_inner_microstep: 390347.32 | backward_inner: 390340.55 | backward_allreduce_microstep: 8.50 | backward_allreduce: 2.90 | reduce_tied_grads: 0.33 | comms: 18.35 | reduce_grads: 0.23 | step: 363.26 | _step_clipping: 0.13 | _step_step: 361.27 | _step_zero_grad: 0.53 | _step_check_overflow: 0.71 samples/sec: 16.049 | iteration 22380/ 143000 | elapsed time per iteration (ms): 63805.3 | learning rate: 5.689E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.338105E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 07:08:42,238] [INFO] [logging.py:60:log_dist] [Rank 0] step=22390, skipped=26, lr=[0.0005688286933187934, 0.0005688286933187934], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22390 loss: 2.3590 iter time (s): 63.799 samples/sec: 16.050 %comms: 0.002859880020327376 %optimizer_step 0.057656844938872985 %forward: 22.83979809290406 %backward: 61.195410012302055 [2025-04-10 07:08:42,238] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28351.40 | forward: 145715.22 | backward_microstep: 390436.83 | backward: 390419.49 | backward_inner_microstep: 390401.32 | backward_inner: 390394.39 | backward_allreduce_microstep: 8.53 | backward_allreduce: 2.95 | reduce_tied_grads: 0.35 | comms: 18.25 | reduce_grads: 0.22 | step: 367.84 | _step_clipping: 0.14 | _step_step: 365.80 | _step_zero_grad: 0.56 | _step_check_overflow: 0.70 samples/sec: 16.050 | iteration 22390/ 143000 | elapsed time per iteration (ms): 63799.4 | learning rate: 5.688E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.356386E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 07:19:21,583] [INFO] [logging.py:60:log_dist] [Rank 0] step=22400, skipped=26, lr=[0.0005687994330451309, 0.0005687994330451309], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22400 loss: 2.3506 iter time (s): 63.934 samples/sec: 16.017 %comms: 0.0028507811129061932 %optimizer_step 0.056746415381975085 %forward: 22.789982714509524 %backward: 61.06238648552156 [2025-04-10 07:19:21,584] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29723.14 | forward: 145705.18 | backward_microstep: 390408.78 | backward: 390395.46 | backward_inner_microstep: 390375.81 | backward_inner: 390368.74 | backward_allreduce_microstep: 10.20 | backward_allreduce: 4.65 | reduce_tied_grads: 0.33 | comms: 18.23 | reduce_grads: 0.21 | step: 362.80 | _step_clipping: 0.12 | _step_step: 360.91 | _step_zero_grad: 0.55 | _step_check_overflow: 0.60 samples/sec: 16.016 | iteration 22400/ 143000 | elapsed time per iteration (ms): 63934.5 | learning rate: 5.688E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.347567E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 07:29:57,424] [INFO] [logging.py:60:log_dist] [Rank 0] step=22410, skipped=26, lr=[0.0005687701597980025, 0.0005687701597980025], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22410 loss: 2.3251 iter time (s): 63.584 samples/sec: 16.105 %comms: 0.0028356275395476195 %optimizer_step 0.05633656944481374 %forward: 22.893943442484613 %backward: 61.40596999365058 [2025-04-10 07:29:57,425] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26324.88 | forward: 145567.83 | backward_microstep: 390456.52 | backward: 390440.99 | backward_inner_microstep: 390422.96 | backward_inner: 390415.88 | backward_allreduce_microstep: 8.46 | backward_allreduce: 2.91 | reduce_tied_grads: 0.32 | comms: 18.03 | reduce_grads: 0.24 | step: 358.21 | _step_clipping: 0.14 | _step_step: 356.45 | _step_zero_grad: 0.51 | _step_check_overflow: 0.47 samples/sec: 16.105 | iteration 22410/ 143000 | elapsed time per iteration (ms): 63584.1 | learning rate: 5.688E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.344674E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 07:40:32,711] [INFO] [logging.py:60:log_dist] [Rank 0] step=22420, skipped=26, lr=[0.0005687408735788209, 0.0005687408735788209], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22420 loss: 2.3403 iter time (s): 63.528 samples/sec: 16.119 %comms: 0.002867450071515004 %optimizer_step 0.05620100810214653 %forward: 22.917365777677656 %backward: 61.461093716110526 [2025-04-10 07:40:32,712] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25734.34 | forward: 145589.72 | backward_microstep: 390470.48 | backward: 390450.78 | backward_inner_microstep: 390429.77 | backward_inner: 390421.06 | backward_allreduce_microstep: 11.62 | backward_allreduce: 2.93 | reduce_tied_grads: 0.30 | comms: 18.22 | reduce_grads: 0.19 | step: 357.03 | _step_clipping: 0.13 | _step_step: 355.17 | _step_zero_grad: 0.58 | _step_check_overflow: 0.52 samples/sec: 16.119 | iteration 22420/ 143000 | elapsed time per iteration (ms): 63528.7 | learning rate: 5.687E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.337953E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 07:51:05,332] [INFO] [logging.py:60:log_dist] [Rank 0] step=22430, skipped=26, lr=[0.0005687115743889998, 0.0005687115743889998], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22430 loss: 2.3486 iter time (s): 63.261 samples/sec: 16.187 %comms: 0.002855453709415157 %optimizer_step 0.05541327402584445 %forward: 23.01353260062108 %backward: 61.71467427957575 [2025-04-10 07:51:05,332] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23121.30 | forward: 145587.01 | backward_microstep: 390429.64 | backward: 390416.15 | backward_inner_microstep: 390398.55 | backward_inner: 390391.81 | backward_allreduce_microstep: 8.38 | backward_allreduce: 2.90 | reduce_tied_grads: 0.29 | comms: 18.06 | reduce_grads: 0.19 | step: 350.55 | _step_clipping: 0.12 | _step_step: 348.69 | _step_zero_grad: 0.51 | _step_check_overflow: 0.65 samples/sec: 16.187 | iteration 22430/ 143000 | elapsed time per iteration (ms): 63262.1 | learning rate: 5.687E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.340607E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 08:01:37,147] [INFO] [logging.py:60:log_dist] [Rank 0] step=22440, skipped=26, lr=[0.0005686822622299531, 0.0005686822622299531], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22440 loss: 2.3371 iter time (s): 63.181 samples/sec: 16.207 %comms: 0.0029423787552877263 %optimizer_step 0.05999123602602827 %forward: 23.06223819331911 %backward: 61.820044365892755 [2025-04-10 08:01:37,148] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21960.55 | forward: 145709.28 | backward_microstep: 390603.74 | backward: 390584.56 | backward_inner_microstep: 390566.19 | backward_inner: 390558.95 | backward_allreduce_microstep: 8.59 | backward_allreduce: 2.98 | reduce_tied_grads: 0.36 | comms: 18.59 | reduce_grads: 0.22 | step: 379.03 | _step_clipping: 0.13 | _step_step: 376.65 | _step_zero_grad: 0.65 | _step_check_overflow: 0.91 samples/sec: 16.207 | iteration 22440/ 143000 | elapsed time per iteration (ms): 63181.5 | learning rate: 5.687E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.346098E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 08:12:13,229] [INFO] [logging.py:60:log_dist] [Rank 0] step=22450, skipped=26, lr=[0.0005686529371030955, 0.0005686529371030955], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22450 loss: 2.3519 iter time (s): 63.608 samples/sec: 16.099 %comms: 0.0028550597232352855 %optimizer_step 0.0579491663187001 %forward: 22.872380763730717 %backward: 61.35990263881263 [2025-04-10 08:12:13,230] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26807.42 | forward: 145485.67 | backward_microstep: 390309.53 | backward: 390295.47 | backward_inner_microstep: 390277.55 | backward_inner: 390270.34 | backward_allreduce_microstep: 8.45 | backward_allreduce: 2.90 | reduce_tied_grads: 0.37 | comms: 18.16 | reduce_grads: 0.24 | step: 368.60 | _step_clipping: 0.13 | _step_step: 366.68 | _step_zero_grad: 0.55 | _step_check_overflow: 0.60 samples/sec: 16.099 | iteration 22450/ 143000 | elapsed time per iteration (ms): 63608.2 | learning rate: 5.687E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.352782E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 08:22:56,304] [INFO] [logging.py:60:log_dist] [Rank 0] step=22460, skipped=26, lr=[0.0005686235990098426, 0.0005686235990098426], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22460 loss: 2.3681 iter time (s): 64.307 samples/sec: 15.924 %comms: 0.0028784438639667923 %optimizer_step 0.05757595864127224 %forward: 22.677741901905108 %backward: 60.751181049286295 [2025-04-10 08:22:56,305] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 33009.73 | forward: 145833.22 | backward_microstep: 390693.78 | backward: 390671.20 | backward_inner_microstep: 390652.15 | backward_inner: 390644.20 | backward_allreduce_microstep: 8.72 | backward_allreduce: 3.01 | reduce_tied_grads: 0.40 | comms: 18.51 | reduce_grads: 0.24 | step: 370.25 | _step_clipping: 0.13 | _step_step: 368.01 | _step_zero_grad: 0.61 | _step_check_overflow: 0.79 samples/sec: 15.923 | iteration 22460/ 143000 | elapsed time per iteration (ms): 64307.5 | learning rate: 5.686E-04 | approx flops per GPU: 68.7TFLOPS | lm_loss: 2.344955E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 08:33:38,006] [INFO] [logging.py:60:log_dist] [Rank 0] step=22470, skipped=26, lr=[0.0005685942479516103, 0.0005685942479516103], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22470 loss: 2.3386 iter time (s): 64.169 samples/sec: 15.958 %comms: 0.0028524260305465027 %optimizer_step 0.0568588839926329 %forward: 22.725513930729015 %backward: 60.89028429734774 [2025-04-10 08:33:38,007] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31575.28 | forward: 145828.45 | backward_microstep: 390750.98 | backward: 390729.82 | backward_inner_microstep: 390710.92 | backward_inner: 390703.50 | backward_allreduce_microstep: 8.79 | backward_allreduce: 3.02 | reduce_tied_grads: 0.33 | comms: 18.30 | reduce_grads: 0.21 | step: 364.86 | _step_clipping: 0.14 | _step_step: 363.03 | _step_zero_grad: 0.53 | _step_check_overflow: 0.52 samples/sec: 15.958 | iteration 22470/ 143000 | elapsed time per iteration (ms): 64170.2 | learning rate: 5.686E-04 | approx flops per GPU: 68.8TFLOPS | lm_loss: 2.341535E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 08:44:10,917] [INFO] [logging.py:60:log_dist] [Rank 0] step=22480, skipped=26, lr=[0.0005685648839298151, 0.0005685648839298151], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22480 loss: 2.3501 iter time (s): 63.290 samples/sec: 16.179 %comms: 0.0034769906714299153 %optimizer_step 0.058958159261700546 %forward: 23.01630854409883 %backward: 61.70875904164961 [2025-04-10 08:44:10,918] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23135.19 | forward: 145671.28 | backward_microstep: 390578.58 | backward: 390557.59 | backward_inner_microstep: 390539.80 | backward_inner: 390532.71 | backward_allreduce_microstep: 8.28 | backward_allreduce: 2.81 | reduce_tied_grads: 0.56 | comms: 22.01 | reduce_grads: 0.23 | step: 373.15 | _step_clipping: 0.17 | _step_step: 371.10 | _step_zero_grad: 0.60 | _step_check_overflow: 0.59 samples/sec: 16.179 | iteration 22480/ 143000 | elapsed time per iteration (ms): 63291.1 | learning rate: 5.686E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.339714E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 08:54:44,659] [INFO] [logging.py:60:log_dist] [Rank 0] step=22490, skipped=26, lr=[0.0005685355069458744, 0.0005685355069458744], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22490 loss: 2.3361 iter time (s): 63.372 samples/sec: 16.159 %comms: 0.002898406201267983 %optimizer_step 0.057259286132861076 %forward: 22.99169225511514 %backward: 61.56239387629975 [2025-04-10 08:54:44,659] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24410.40 | forward: 145702.85 | backward_microstep: 390144.02 | backward: 390132.94 | backward_inner_microstep: 390113.99 | backward_inner: 390107.34 | backward_allreduce_microstep: 8.19 | backward_allreduce: 2.80 | reduce_tied_grads: 0.33 | comms: 18.37 | reduce_grads: 0.24 | step: 362.86 | _step_clipping: 0.12 | _step_step: 361.05 | _step_zero_grad: 0.54 | _step_check_overflow: 0.51 samples/sec: 16.158 | iteration 22490/ 143000 | elapsed time per iteration (ms): 63374.1 | learning rate: 5.685E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.339677E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 09:05:12,357] [INFO] [logging.py:60:log_dist] [Rank 0] step=22500, skipped=26, lr=[0.0005685061170012059, 0.0005685061170012059], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22500 loss: 2.3485 iter time (s): 62.769 samples/sec: 16.314 %comms: 0.0028764381500878556 %optimizer_step 0.05654127288534221 %forward: 23.145454562008364 %backward: 62.157064529798625 [2025-04-10 09:05:12,357] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18825.34 | forward: 145282.40 | backward_microstep: 390168.51 | backward: 390155.54 | backward_inner_microstep: 390135.73 | backward_inner: 390129.29 | backward_allreduce_microstep: 7.75 | backward_allreduce: 2.67 | reduce_tied_grads: 0.31 | comms: 18.06 | reduce_grads: 0.19 | step: 354.91 | _step_clipping: 0.12 | _step_step: 353.07 | _step_zero_grad: 0.49 | _step_check_overflow: 0.63 samples/sec: 16.314 | iteration 22500/ 143000 | elapsed time per iteration (ms): 62769.8 | learning rate: 5.685E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.345259E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 09:15:44,241] [INFO] [logging.py:60:log_dist] [Rank 0] step=22510, skipped=26, lr=[0.0005684767140972282, 0.0005684767140972282], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22510 loss: 2.3400 iter time (s): 63.188 samples/sec: 16.206 %comms: 0.002896742041636288 %optimizer_step 0.05842550453185354 %forward: 23.0411333958962 %backward: 61.766749964489144 [2025-04-10 09:15:44,241] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22508.18 | forward: 145591.82 | backward_microstep: 390303.75 | backward: 390290.41 | backward_inner_microstep: 390272.69 | backward_inner: 390265.81 | backward_allreduce_microstep: 8.38 | backward_allreduce: 2.89 | reduce_tied_grads: 0.33 | comms: 18.30 | reduce_grads: 0.21 | step: 369.18 | _step_clipping: 0.13 | _step_step: 367.35 | _step_zero_grad: 0.52 | _step_check_overflow: 0.56 samples/sec: 16.206 | iteration 22510/ 143000 | elapsed time per iteration (ms): 63188.4 | learning rate: 5.685E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.340627E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 09:26:14,347] [INFO] [logging.py:60:log_dist] [Rank 0] step=22520, skipped=26, lr=[0.0005684472982353604, 0.0005684472982353604], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22520 loss: 2.3776 iter time (s): 63.010 samples/sec: 16.251 %comms: 0.0028972298505082067 %optimizer_step 0.056271090998223576 %forward: 23.07913293881182 %backward: 61.91678253050912 [2025-04-10 09:26:14,348] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21110.46 | forward: 145421.83 | backward_microstep: 390149.35 | backward: 390138.22 | backward_inner_microstep: 390121.10 | backward_inner: 390112.76 | backward_allreduce_microstep: 8.17 | backward_allreduce: 2.82 | reduce_tied_grads: 0.29 | comms: 18.26 | reduce_grads: 0.20 | step: 354.56 | _step_clipping: 0.12 | _step_step: 352.79 | _step_zero_grad: 0.51 | _step_check_overflow: 0.55 samples/sec: 16.251 | iteration 22520/ 143000 | elapsed time per iteration (ms): 63010.7 | learning rate: 5.684E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.345940E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 09:36:44,139] [INFO] [logging.py:60:log_dist] [Rank 0] step=22530, skipped=26, lr=[0.0005684178694170223, 0.0005684178694170223], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22530 loss: 2.3398 iter time (s): 62.979 samples/sec: 16.259 %comms: 0.0028744505351365156 %optimizer_step 0.0561318720964783 %forward: 23.0896552092892 %backward: 61.941463940337925 [2025-04-10 09:36:44,139] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20821.82 | forward: 145415.39 | backward_microstep: 390109.20 | backward: 390098.61 | backward_inner_microstep: 390079.81 | backward_inner: 390073.26 | backward_allreduce_microstep: 8.16 | backward_allreduce: 2.81 | reduce_tied_grads: 0.31 | comms: 18.10 | reduce_grads: 0.22 | step: 353.51 | _step_clipping: 0.12 | _step_step: 351.71 | _step_zero_grad: 0.55 | _step_check_overflow: 0.53 samples/sec: 16.259 | iteration 22530/ 143000 | elapsed time per iteration (ms): 62979.2 | learning rate: 5.684E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.346623E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 09:47:17,701] [INFO] [logging.py:60:log_dist] [Rank 0] step=22540, skipped=26, lr=[0.0005683884276436341, 0.0005683884276436341], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22540 loss: 2.3451 iter time (s): 63.356 samples/sec: 16.163 %comms: 0.0028776663837047313 %optimizer_step 0.05653376922120899 %forward: 22.968157255994317 %backward: 61.63866384881177 [2025-04-10 09:47:17,702] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24017.33 | forward: 145516.16 | backward_microstep: 390531.11 | backward: 390515.50 | backward_inner_microstep: 390498.23 | backward_inner: 390491.32 | backward_allreduce_microstep: 7.96 | backward_allreduce: 2.76 | reduce_tied_grads: 0.31 | comms: 18.23 | reduce_grads: 0.19 | step: 358.17 | _step_clipping: 0.32 | _step_step: 356.22 | _step_zero_grad: 0.47 | _step_check_overflow: 0.58 samples/sec: 16.163 | iteration 22540/ 143000 | elapsed time per iteration (ms): 63356.2 | learning rate: 5.684E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.337411E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 09:57:48,974] [INFO] [logging.py:60:log_dist] [Rank 0] step=22550, skipped=26, lr=[0.000568358972916617, 0.000568358972916617], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22550 loss: 2.3402 iter time (s): 63.127 samples/sec: 16.221 %comms: 0.0029348181965917746 %optimizer_step 0.05815465418749509 %forward: 23.06211807543926 %backward: 61.85811560659492 [2025-04-10 09:57:48,975] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21672.81 | forward: 145583.66 | backward_microstep: 390505.98 | backward: 390490.19 | backward_inner_microstep: 390472.31 | backward_inner: 390463.29 | backward_allreduce_microstep: 8.32 | backward_allreduce: 2.86 | reduce_tied_grads: 0.35 | comms: 18.53 | reduce_grads: 0.23 | step: 367.11 | _step_clipping: 0.12 | _step_step: 365.13 | _step_zero_grad: 0.51 | _step_check_overflow: 0.71 samples/sec: 16.221 | iteration 22550/ 143000 | elapsed time per iteration (ms): 63127.4 | learning rate: 5.684E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.354930E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 10:08:24,667] [INFO] [logging.py:60:log_dist] [Rank 0] step=22560, skipped=26, lr=[0.0005683295052373924, 0.0005683295052373924], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22560 loss: 2.3403 iter time (s): 63.569 samples/sec: 16.109 %comms: 0.0029321578101052983 %optimizer_step 0.05899826013213932 %forward: 22.934152610557888 %backward: 61.42313416298636 [2025-04-10 10:08:24,668] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25890.67 | forward: 145789.27 | backward_microstep: 390474.93 | backward: 390458.46 | backward_inner_microstep: 390439.91 | backward_inner: 390432.59 | backward_allreduce_microstep: 8.79 | backward_allreduce: 3.06 | reduce_tied_grads: 0.33 | comms: 18.64 | reduce_grads: 0.22 | step: 375.04 | _step_clipping: 0.14 | _step_step: 372.79 | _step_zero_grad: 0.62 | _step_check_overflow: 0.79 samples/sec: 16.108 | iteration 22560/ 143000 | elapsed time per iteration (ms): 63569.3 | learning rate: 5.683E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.356734E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 10:19:00,331] [INFO] [logging.py:60:log_dist] [Rank 0] step=22570, skipped=26, lr=[0.0005683000246073826, 0.0005683000246073826], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22570 loss: 2.3277 iter time (s): 63.566 samples/sec: 16.109 %comms: 0.00291946792124209 %optimizer_step 0.05668314421132762 %forward: 22.913461927186656 %backward: 61.402399236868774 [2025-04-10 10:19:00,332] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26198.17 | forward: 145650.90 | backward_microstep: 390321.27 | backward: 390308.33 | backward_inner_microstep: 390290.67 | backward_inner: 390284.01 | backward_allreduce_microstep: 8.49 | backward_allreduce: 2.93 | reduce_tied_grads: 0.34 | comms: 18.56 | reduce_grads: 0.22 | step: 360.31 | _step_clipping: 0.14 | _step_step: 358.31 | _step_zero_grad: 0.53 | _step_check_overflow: 0.70 samples/sec: 16.109 | iteration 22570/ 143000 | elapsed time per iteration (ms): 63566.3 | learning rate: 5.683E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.343997E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 10:29:32,418] [INFO] [logging.py:60:log_dist] [Rank 0] step=22580, skipped=26, lr=[0.0005682705310280107, 0.0005682705310280107], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22580 loss: 2.3571 iter time (s): 63.208 samples/sec: 16.200 %comms: 0.002875780042653354 %optimizer_step 0.05742688435327929 %forward: 23.02177376296455 %backward: 61.7495618955186 [2025-04-10 10:29:32,419] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22748.72 | forward: 145516.35 | backward_microstep: 390320.35 | backward: 390307.50 | backward_inner_microstep: 390285.13 | backward_inner: 390278.22 | backward_allreduce_microstep: 12.57 | backward_allreduce: 4.81 | reduce_tied_grads: 0.37 | comms: 18.18 | reduce_grads: 0.22 | step: 362.98 | _step_clipping: 0.13 | _step_step: 361.09 | _step_zero_grad: 0.51 | _step_check_overflow: 0.65 samples/sec: 16.200 | iteration 22580/ 143000 | elapsed time per iteration (ms): 63208.7 | learning rate: 5.683E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.346030E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 10:40:02,565] [INFO] [logging.py:60:log_dist] [Rank 0] step=22590, skipped=26, lr=[0.0005682410245006999, 0.0005682410245006999], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22590 loss: 2.3293 iter time (s): 63.014 samples/sec: 16.250 %comms: 0.002850088691965548 %optimizer_step 0.05507752507794754 %forward: 23.077694547139053 %backward: 61.895538779151885 [2025-04-10 10:40:02,566] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21261.17 | forward: 145422.16 | backward_microstep: 390038.52 | backward: 390029.54 | backward_inner_microstep: 390011.77 | backward_inner: 390005.11 | backward_allreduce_microstep: 8.81 | backward_allreduce: 3.02 | reduce_tied_grads: 0.28 | comms: 17.96 | reduce_grads: 0.20 | step: 347.07 | _step_clipping: 0.12 | _step_step: 345.30 | _step_zero_grad: 0.47 | _step_check_overflow: 0.61 samples/sec: 16.250 | iteration 22590/ 143000 | elapsed time per iteration (ms): 63014.7 | learning rate: 5.682E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.337113E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 10:50:36,005] [INFO] [logging.py:60:log_dist] [Rank 0] step=22600, skipped=26, lr=[0.0005682115050268744, 0.0005682115050268744], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22600 loss: 2.3346 iter time (s): 63.342 samples/sec: 16.166 %comms: 0.003207316929864439 %optimizer_step 0.05817577798090338 %forward: 23.01276911194242 %backward: 61.65878986826758 [2025-04-10 10:50:36,006] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23553.10 | forward: 145768.19 | backward_microstep: 390580.45 | backward: 390561.00 | backward_inner_microstep: 390542.37 | backward_inner: 390534.65 | backward_allreduce_microstep: 8.65 | backward_allreduce: 2.98 | reduce_tied_grads: 0.36 | comms: 20.32 | reduce_grads: 1.96 | step: 368.50 | _step_clipping: 0.12 | _step_step: 366.65 | _step_zero_grad: 0.55 | _step_check_overflow: 0.53 samples/sec: 16.166 | iteration 22600/ 143000 | elapsed time per iteration (ms): 63344.0 | learning rate: 5.682E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.339555E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 11:01:02,291] [INFO] [logging.py:60:log_dist] [Rank 0] step=22610, skipped=26, lr=[0.0005681819726079591, 0.0005681819726079591], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22610 loss: 2.3503 iter time (s): 62.628 samples/sec: 16.351 %comms: 0.0029182537241741052 %optimizer_step 0.05692210790298592 %forward: 23.234822273142264 %backward: 62.336001869655874 [2025-04-10 11:01:02,292] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16843.78 | forward: 145515.15 | backward_microstep: 390415.86 | backward: 390398.20 | backward_inner_microstep: 390380.15 | backward_inner: 390373.23 | backward_allreduce_microstep: 8.47 | backward_allreduce: 2.90 | reduce_tied_grads: 0.30 | comms: 18.28 | reduce_grads: 0.21 | step: 356.49 | _step_clipping: 0.12 | _step_step: 354.71 | _step_zero_grad: 0.49 | _step_check_overflow: 0.58 samples/sec: 16.350 | iteration 22610/ 143000 | elapsed time per iteration (ms): 62628.6 | learning rate: 5.682E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.338653E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 11:11:32,165] [INFO] [logging.py:60:log_dist] [Rank 0] step=22620, skipped=26, lr=[0.0005681524272453791, 0.0005681524272453791], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22620 loss: 2.3623 iter time (s): 62.987 samples/sec: 16.257 %comms: 0.0028739990748514197 %optimizer_step 0.05698044686783438 %forward: 23.08662032519581 %backward: 61.95063765267995 [2025-04-10 11:11:32,166] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20760.88 | forward: 145415.29 | backward_microstep: 390220.77 | backward: 390207.39 | backward_inner_microstep: 390189.64 | backward_inner: 390182.86 | backward_allreduce_microstep: 8.44 | backward_allreduce: 2.93 | reduce_tied_grads: 0.31 | comms: 18.10 | reduce_grads: 0.21 | step: 358.90 | _step_clipping: 0.11 | _step_step: 357.09 | _step_zero_grad: 0.52 | _step_check_overflow: 0.57 samples/sec: 16.257 | iteration 22620/ 143000 | elapsed time per iteration (ms): 62987.4 | learning rate: 5.682E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.343269E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 11:22:01,221] [INFO] [logging.py:60:log_dist] [Rank 0] step=22630, skipped=26, lr=[0.0005681228689405606, 0.0005681228689405606], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22630 loss: 2.3451 iter time (s): 62.905 samples/sec: 16.278 %comms: 0.002883266865971094 %optimizer_step 0.05649255687650859 %forward: 23.12513693082454 %backward: 62.05298821326205 [2025-04-10 11:22:01,222] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19705.50 | forward: 145468.89 | backward_microstep: 390358.62 | backward: 390344.89 | backward_inner_microstep: 390326.73 | backward_inner: 390319.59 | backward_allreduce_microstep: 8.47 | backward_allreduce: 2.91 | reduce_tied_grads: 0.34 | comms: 18.14 | reduce_grads: 0.24 | step: 355.37 | _step_clipping: 0.14 | _step_step: 353.48 | _step_zero_grad: 0.52 | _step_check_overflow: 0.60 samples/sec: 16.278 | iteration 22630/ 143000 | elapsed time per iteration (ms): 62905.7 | learning rate: 5.681E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.348478E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 11:32:38,013] [INFO] [logging.py:60:log_dist] [Rank 0] step=22640, skipped=26, lr=[0.0005680932976949301, 0.0005680932976949301], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22640 loss: 2.3666 iter time (s): 63.679 samples/sec: 16.081 %comms: 0.002875202226809286 %optimizer_step 0.05515735864073007 %forward: 22.898302231930046 %backward: 61.31102985379274 [2025-04-10 11:32:38,014] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26995.99 | forward: 145813.12 | backward_microstep: 390438.05 | backward: 390419.90 | backward_inner_microstep: 390398.05 | backward_inner: 390391.05 | backward_allreduce_microstep: 8.83 | backward_allreduce: 3.20 | reduce_tied_grads: 0.31 | comms: 18.31 | reduce_grads: 0.21 | step: 351.23 | _step_clipping: 0.13 | _step_step: 349.35 | _step_zero_grad: 0.53 | _step_check_overflow: 0.63 samples/sec: 16.081 | iteration 22640/ 143000 | elapsed time per iteration (ms): 63679.2 | learning rate: 5.681E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.340897E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 11:43:14,966] [INFO] [logging.py:60:log_dist] [Rank 0] step=22650, skipped=26, lr=[0.0005680637135099149, 0.0005680637135099149], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22650 loss: 2.3464 iter time (s): 63.695 samples/sec: 16.077 %comms: 0.0029225046917933217 %optimizer_step 0.05618275333819754 %forward: 22.906718068282284 %backward: 61.32098627729792 [2025-04-10 11:43:14,966] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26892.92 | forward: 145903.36 | backward_microstep: 390599.65 | backward: 390581.41 | backward_inner_microstep: 390562.82 | backward_inner: 390555.25 | backward_allreduce_microstep: 8.74 | backward_allreduce: 3.09 | reduce_tied_grads: 0.33 | comms: 18.61 | reduce_grads: 0.21 | step: 357.85 | _step_clipping: 0.14 | _step_step: 355.95 | _step_zero_grad: 0.52 | _step_check_overflow: 0.60 samples/sec: 16.077 | iteration 22650/ 143000 | elapsed time per iteration (ms): 63695.2 | learning rate: 5.681E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.343985E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 11:53:49,743] [INFO] [logging.py:60:log_dist] [Rank 0] step=22660, skipped=26, lr=[0.0005680341163869429, 0.0005680341163869429], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22660 loss: 2.3402 iter time (s): 63.477 samples/sec: 16.132 %comms: 0.0029122351790362455 %optimizer_step 0.05672608651563524 %forward: 22.910963930688773 %backward: 61.48059109209548 [2025-04-10 11:53:49,743] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25549.70 | forward: 145432.14 | backward_microstep: 390276.31 | backward: 390260.93 | backward_inner_microstep: 390243.58 | backward_inner: 390236.78 | backward_allreduce_microstep: 8.13 | backward_allreduce: 2.80 | reduce_tied_grads: 0.34 | comms: 18.49 | reduce_grads: 0.21 | step: 360.08 | _step_clipping: 0.13 | _step_step: 358.11 | _step_zero_grad: 0.57 | _step_check_overflow: 0.64 samples/sec: 16.132 | iteration 22660/ 143000 | elapsed time per iteration (ms): 63477.7 | learning rate: 5.680E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.340244E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 12:04:25,480] [INFO] [logging.py:60:log_dist] [Rank 0] step=22670, skipped=26, lr=[0.0005680045063274424, 0.0005680045063274424], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22670 loss: 2.3641 iter time (s): 63.573 samples/sec: 16.107 %comms: 0.002852519718638385 %optimizer_step 0.058294783176282505 %forward: 22.85890125314794 %backward: 61.35404635054381 [2025-04-10 12:04:25,480] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26873.23 | forward: 145321.15 | backward_microstep: 390057.71 | backward: 390046.77 | backward_inner_microstep: 390029.65 | backward_inner: 390022.83 | backward_allreduce_microstep: 8.13 | backward_allreduce: 2.81 | reduce_tied_grads: 0.32 | comms: 18.13 | reduce_grads: 0.19 | step: 370.60 | _step_clipping: 0.12 | _step_step: 368.77 | _step_zero_grad: 0.53 | _step_check_overflow: 0.58 samples/sec: 16.107 | iteration 22670/ 143000 | elapsed time per iteration (ms): 63573.7 | learning rate: 5.680E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.339088E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 12:14:56,642] [INFO] [logging.py:60:log_dist] [Rank 0] step=22680, skipped=26, lr=[0.0005679748833328428, 0.0005679748833328428], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22680 loss: 2.3331 iter time (s): 63.116 samples/sec: 16.224 %comms: 0.0029275173649066363 %optimizer_step 0.05834874262998854 %forward: 23.03794194943736 %backward: 61.80441118946629 [2025-04-10 12:14:56,643] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22157.82 | forward: 145405.35 | backward_microstep: 390095.25 | backward: 390082.25 | backward_inner_microstep: 390064.81 | backward_inner: 390058.01 | backward_allreduce_microstep: 8.22 | backward_allreduce: 2.85 | reduce_tied_grads: 0.35 | comms: 18.48 | reduce_grads: 0.23 | step: 368.27 | _step_clipping: 0.13 | _step_step: 366.42 | _step_zero_grad: 0.50 | _step_check_overflow: 0.58 samples/sec: 16.224 | iteration 22680/ 143000 | elapsed time per iteration (ms): 63116.2 | learning rate: 5.680E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.346224E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 12:25:38,200] [INFO] [logging.py:60:log_dist] [Rank 0] step=22690, skipped=26, lr=[0.0005679452474045736, 0.0005679452474045736], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22690 loss: 2.3467 iter time (s): 64.155 samples/sec: 15.961 %comms: 0.002854214179339367 %optimizer_step 0.05545528961080774 %forward: 22.70108633001558 %backward: 60.853096682931195 [2025-04-10 12:25:38,200] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31972.08 | forward: 145639.23 | backward_microstep: 390418.27 | backward: 390404.13 | backward_inner_microstep: 390384.19 | backward_inner: 390377.17 | backward_allreduce_microstep: 10.52 | backward_allreduce: 2.97 | reduce_tied_grads: 0.34 | comms: 18.31 | reduce_grads: 0.23 | step: 355.77 | _step_clipping: 0.12 | _step_step: 353.93 | _step_zero_grad: 0.51 | _step_check_overflow: 0.61 samples/sec: 15.961 | iteration 22690/ 143000 | elapsed time per iteration (ms): 64155.7 | learning rate: 5.679E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.339750E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 12:36:09,269] [INFO] [logging.py:60:log_dist] [Rank 0] step=22700, skipped=26, lr=[0.0005679155985440653, 0.0005679155985440653], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22700 loss: 2.3480 iter time (s): 63.106 samples/sec: 16.227 %comms: 0.002945290391124862 %optimizer_step 0.05744090762462819 %forward: 23.064254379309585 %backward: 61.87422413446644 [2025-04-10 12:36:09,270] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21488.39 | forward: 145549.96 | backward_microstep: 390480.15 | backward: 390465.29 | backward_inner_microstep: 390447.37 | backward_inner: 390440.56 | backward_allreduce_microstep: 8.40 | backward_allreduce: 2.87 | reduce_tied_grads: 0.33 | comms: 18.59 | reduce_grads: 0.21 | step: 362.49 | _step_clipping: 0.13 | _step_step: 360.58 | _step_zero_grad: 0.53 | _step_check_overflow: 0.60 samples/sec: 16.226 | iteration 22700/ 143000 | elapsed time per iteration (ms): 63106.9 | learning rate: 5.679E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.342153E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 12:46:44,738] [INFO] [logging.py:60:log_dist] [Rank 0] step=22710, skipped=26, lr=[0.0005678859367527489, 0.0005678859367527489], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22710 loss: 2.3570 iter time (s): 63.546 samples/sec: 16.114 %comms: 0.0029188147973550527 %optimizer_step 0.05691486253167386 %forward: 22.91858548977782 %backward: 61.44434421176801 [2025-04-10 12:46:44,739] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25847.20 | forward: 145639.32 | backward_microstep: 390471.00 | backward: 390456.57 | backward_inner_microstep: 390438.97 | backward_inner: 390431.87 | backward_allreduce_microstep: 8.35 | backward_allreduce: 2.87 | reduce_tied_grads: 0.35 | comms: 18.55 | reduce_grads: 0.21 | step: 361.67 | _step_clipping: 0.13 | _step_step: 359.66 | _step_zero_grad: 0.60 | _step_check_overflow: 0.64 samples/sec: 16.114 | iteration 22710/ 143000 | elapsed time per iteration (ms): 63547.0 | learning rate: 5.679E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.360909E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 12:57:19,133] [INFO] [logging.py:60:log_dist] [Rank 0] step=22720, skipped=26, lr=[0.0005678562620320559, 0.0005678562620320559], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22720 loss: 2.3253 iter time (s): 63.439 samples/sec: 16.142 %comms: 0.002845139420266967 %optimizer_step 0.05619056398980675 %forward: 22.93985080581811 %backward: 61.496055086826985 [2025-04-10 12:57:19,134] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25239.60 | forward: 145527.80 | backward_microstep: 390134.64 | backward: 390123.96 | backward_inner_microstep: 390106.76 | backward_inner: 390100.12 | backward_allreduce_microstep: 8.24 | backward_allreduce: 2.83 | reduce_tied_grads: 0.33 | comms: 18.05 | reduce_grads: 0.22 | step: 356.47 | _step_clipping: 0.12 | _step_step: 354.70 | _step_zero_grad: 0.49 | _step_check_overflow: 0.55 samples/sec: 16.141 | iteration 22720/ 143000 | elapsed time per iteration (ms): 63439.4 | learning rate: 5.679E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.339291E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 13:07:54,352] [INFO] [logging.py:60:log_dist] [Rank 0] step=22730, skipped=26, lr=[0.0005678265743834187, 0.0005678265743834187], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22730 loss: 2.3390 iter time (s): 63.521 samples/sec: 16.121 %comms: 0.0029259004254469935 %optimizer_step 0.056198652338761666 %forward: 22.9028127612676 %backward: 61.44538400797796 [2025-04-10 13:07:54,352] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25876.33 | forward: 145481.50 | backward_microstep: 390322.70 | backward: 390308.69 | backward_inner_microstep: 390291.20 | backward_inner: 390284.40 | backward_allreduce_microstep: 8.25 | backward_allreduce: 2.84 | reduce_tied_grads: 0.33 | comms: 18.59 | reduce_grads: 0.22 | step: 356.98 | _step_clipping: 0.12 | _step_step: 355.21 | _step_zero_grad: 0.51 | _step_check_overflow: 0.54 samples/sec: 16.120 | iteration 22730/ 143000 | elapsed time per iteration (ms): 63521.8 | learning rate: 5.678E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.334879E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 13:18:26,674] [INFO] [logging.py:60:log_dist] [Rank 0] step=22740, skipped=26, lr=[0.0005677968738082699, 0.0005677968738082699], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22740 loss: 2.3436 iter time (s): 63.232 samples/sec: 16.194 %comms: 0.0028879434823849913 %optimizer_step 0.05667110223696078 %forward: 23.006773774901962 %backward: 61.716232793980986 [2025-04-10 13:18:26,675] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23091.23 | forward: 145475.73 | backward_microstep: 390257.07 | backward: 390242.19 | backward_inner_microstep: 390224.17 | backward_inner: 390217.03 | backward_allreduce_microstep: 8.60 | backward_allreduce: 2.97 | reduce_tied_grads: 0.33 | comms: 18.26 | reduce_grads: 0.21 | step: 358.34 | _step_clipping: 0.15 | _step_step: 356.47 | _step_zero_grad: 0.50 | _step_check_overflow: 0.60 samples/sec: 16.194 | iteration 22740/ 143000 | elapsed time per iteration (ms): 63232.3 | learning rate: 5.678E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.344091E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 13:28:59,718] [INFO] [logging.py:60:log_dist] [Rank 0] step=22750, skipped=26, lr=[0.0005677671603080432, 0.0005677671603080432], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22750 loss: 2.3412 iter time (s): 63.304 samples/sec: 16.176 %comms: 0.002877236796916051 %optimizer_step 0.05622240091718453 %forward: 23.006566849040684 %backward: 61.64630337271434 [2025-04-10 13:28:59,718] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23624.74 | forward: 145640.19 | backward_microstep: 390256.91 | backward: 390244.21 | backward_inner_microstep: 390226.63 | backward_inner: 390219.72 | backward_allreduce_microstep: 8.38 | backward_allreduce: 2.87 | reduce_tied_grads: 0.33 | comms: 18.21 | reduce_grads: 0.21 | step: 355.91 | _step_clipping: 0.12 | _step_step: 354.14 | _step_zero_grad: 0.52 | _step_check_overflow: 0.49 samples/sec: 16.176 | iteration 22750/ 143000 | elapsed time per iteration (ms): 63304.3 | learning rate: 5.678E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.341132E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 13:39:32,341] [INFO] [logging.py:60:log_dist] [Rank 0] step=22760, skipped=26, lr=[0.0005677374338841726, 0.0005677374338841726], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22760 loss: 2.3578 iter time (s): 63.262 samples/sec: 16.187 %comms: 0.002904394328984518 %optimizer_step 0.05783262851454778 %forward: 23.04576305937723 %backward: 61.70636208593445 [2025-04-10 13:39:32,342] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22882.79 | forward: 145791.70 | backward_microstep: 390380.80 | backward: 390365.69 | backward_inner_microstep: 390347.26 | backward_inner: 390340.24 | backward_allreduce_microstep: 8.65 | backward_allreduce: 2.96 | reduce_tied_grads: 0.37 | comms: 18.37 | reduce_grads: 0.22 | step: 365.86 | _step_clipping: 0.13 | _step_step: 363.93 | _step_zero_grad: 0.53 | _step_check_overflow: 0.61 samples/sec: 16.187 | iteration 22760/ 143000 | elapsed time per iteration (ms): 63262.4 | learning rate: 5.677E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.351468E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 13:50:02,833] [INFO] [logging.py:60:log_dist] [Rank 0] step=22770, skipped=26, lr=[0.0005677076945380928, 0.0005677076945380928], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22770 loss: 2.3415 iter time (s): 63.049 samples/sec: 16.241 %comms: 0.0028805650477069884 %optimizer_step 0.05728450288203889 %forward: 23.07489002674811 %backward: 61.85919117753206 [2025-04-10 13:50:02,834] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21497.59 | forward: 145483.76 | backward_microstep: 390022.63 | backward: 390013.02 | backward_inner_microstep: 389996.12 | backward_inner: 389989.76 | backward_allreduce_microstep: 8.17 | backward_allreduce: 2.74 | reduce_tied_grads: 0.31 | comms: 18.16 | reduce_grads: 0.21 | step: 361.17 | _step_clipping: 0.13 | _step_step: 359.31 | _step_zero_grad: 0.56 | _step_check_overflow: 0.55 samples/sec: 16.241 | iteration 22770/ 143000 | elapsed time per iteration (ms): 63049.1 | learning rate: 5.677E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.338830E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 14:00:36,032] [INFO] [logging.py:60:log_dist] [Rank 0] step=22780, skipped=26, lr=[0.0005676779422712394, 0.0005676779422712394], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22780 loss: 2.3263 iter time (s): 63.319 samples/sec: 16.172 %comms: 0.0028720829567565873 %optimizer_step 0.056040390505850364 %forward: 22.96596476191616 %backward: 61.59288719492822 [2025-04-10 14:00:36,033] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24269.90 | forward: 145419.09 | backward_microstep: 390012.33 | backward: 390002.40 | backward_inner_microstep: 389983.80 | backward_inner: 389977.38 | backward_allreduce_microstep: 9.80 | backward_allreduce: 2.80 | reduce_tied_grads: 0.38 | comms: 18.19 | reduce_grads: 0.22 | step: 354.84 | _step_clipping: 0.14 | _step_step: 352.90 | _step_zero_grad: 0.52 | _step_check_overflow: 0.68 samples/sec: 16.172 | iteration 22780/ 143000 | elapsed time per iteration (ms): 63320.0 | learning rate: 5.677E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.344809E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 14:11:00,851] [INFO] [logging.py:60:log_dist] [Rank 0] step=22790, skipped=26, lr=[0.000567648177085048, 0.000567648177085048], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22790 loss: 2.3293 iter time (s): 62.481 samples/sec: 16.389 %comms: 0.0028677192879875612 %optimizer_step 0.05614495482774897 %forward: 23.245014636026013 %backward: 62.41259571057483 [2025-04-10 14:11:00,851] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16138.10 | forward: 145237.78 | backward_microstep: 389970.89 | backward: 389961.76 | backward_inner_microstep: 389944.57 | backward_inner: 389938.15 | backward_allreduce_microstep: 8.38 | backward_allreduce: 2.86 | reduce_tied_grads: 0.31 | comms: 17.92 | reduce_grads: 0.19 | step: 350.80 | _step_clipping: 0.11 | _step_step: 349.11 | _step_zero_grad: 0.48 | _step_check_overflow: 0.54 samples/sec: 16.389 | iteration 22790/ 143000 | elapsed time per iteration (ms): 62481.8 | learning rate: 5.676E-04 | approx flops per GPU: 70.7TFLOPS | lm_loss: 2.335909E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 14:21:31,165] [INFO] [logging.py:60:log_dist] [Rank 0] step=22800, skipped=26, lr=[0.0005676183989809557, 0.0005676183989809557], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22800 loss: 2.3458 iter time (s): 63.031 samples/sec: 16.246 %comms: 0.0028882546257135732 %optimizer_step 0.05654797595532356 %forward: 23.074021699699806 %backward: 61.88640037206902 [2025-04-10 14:21:31,166] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21278.37 | forward: 145437.62 | backward_microstep: 390086.24 | backward: 390075.52 | backward_inner_microstep: 390058.19 | backward_inner: 390051.58 | backward_allreduce_microstep: 8.30 | backward_allreduce: 2.88 | reduce_tied_grads: 0.32 | comms: 18.20 | reduce_grads: 0.21 | step: 356.43 | _step_clipping: 0.13 | _step_step: 354.64 | _step_zero_grad: 0.50 | _step_check_overflow: 0.57 samples/sec: 16.246 | iteration 22800/ 143000 | elapsed time per iteration (ms): 63031.4 | learning rate: 5.676E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.342499E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 14:32:02,067] [INFO] [logging.py:60:log_dist] [Rank 0] step=22810, skipped=26, lr=[0.0005675886079603991, 0.0005675886079603991], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22810 loss: 2.3476 iter time (s): 63.090 samples/sec: 16.231 %comms: 0.0028397641065379302 %optimizer_step 0.05529466075998857 %forward: 23.03087576785462 %backward: 61.82132267124989 [2025-04-10 14:32:02,068] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22059.77 | forward: 145300.92 | backward_microstep: 390038.64 | backward: 390028.38 | backward_inner_microstep: 390011.77 | backward_inner: 390005.35 | backward_allreduce_microstep: 7.96 | backward_allreduce: 2.72 | reduce_tied_grads: 0.28 | comms: 17.92 | reduce_grads: 0.19 | step: 348.85 | _step_clipping: 0.10 | _step_step: 347.03 | _step_zero_grad: 0.50 | _step_check_overflow: 0.65 samples/sec: 16.231 | iteration 22810/ 143000 | elapsed time per iteration (ms): 63090.2 | learning rate: 5.676E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.348168E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 14:42:32,938] [INFO] [logging.py:60:log_dist] [Rank 0] step=22820, skipped=26, lr=[0.0005675588040248166, 0.0005675588040248166], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22820 loss: 2.3426 iter time (s): 63.086 samples/sec: 16.232 %comms: 0.0028926655056637596 %optimizer_step 0.056981251261898916 %forward: 23.044556351378176 %backward: 61.85071802557531 [2025-04-10 14:42:32,939] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21748.80 | forward: 145379.90 | backward_microstep: 390206.65 | backward: 390194.15 | backward_inner_microstep: 390176.24 | backward_inner: 390169.47 | backward_allreduce_microstep: 8.54 | backward_allreduce: 2.86 | reduce_tied_grads: 0.35 | comms: 18.25 | reduce_grads: 0.21 | step: 359.47 | _step_clipping: 0.13 | _step_step: 357.62 | _step_zero_grad: 0.52 | _step_check_overflow: 0.58 samples/sec: 16.232 | iteration 22820/ 143000 | elapsed time per iteration (ms): 63087.2 | learning rate: 5.676E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.345609E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 14:53:10,178] [INFO] [logging.py:60:log_dist] [Rank 0] step=22830, skipped=26, lr=[0.0005675289871756463, 0.0005675289871756463], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22830 loss: 2.3491 iter time (s): 63.723 samples/sec: 16.069 %comms: 0.0028862775936858375 %optimizer_step 0.05647986884085341 %forward: 22.84832353907285 %backward: 61.234772153021524 [2025-04-10 14:53:10,179] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27866.61 | forward: 145597.15 | backward_microstep: 390222.18 | backward: 390208.41 | backward_inner_microstep: 390188.76 | backward_inner: 390181.48 | backward_allreduce_microstep: 9.70 | backward_allreduce: 3.19 | reduce_tied_grads: 0.34 | comms: 18.39 | reduce_grads: 0.22 | step: 359.91 | _step_clipping: 0.12 | _step_step: 358.02 | _step_zero_grad: 0.58 | _step_check_overflow: 0.55 samples/sec: 16.069 | iteration 22830/ 143000 | elapsed time per iteration (ms): 63724.0 | learning rate: 5.675E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.339705E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 15:03:43,547] [INFO] [logging.py:60:log_dist] [Rank 0] step=22840, skipped=26, lr=[0.0005674991574143275, 0.0005674991574143275], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22840 loss: 2.3319 iter time (s): 63.336 samples/sec: 16.168 %comms: 0.0028377382107927737 %optimizer_step 0.05612879421244269 %forward: 22.950062194424348 %backward: 61.59201861948057 [2025-04-10 15:03:43,548] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24403.82 | forward: 145357.21 | backward_microstep: 390114.14 | backward: 390101.07 | backward_inner_microstep: 390083.57 | backward_inner: 390076.93 | backward_allreduce_microstep: 8.32 | backward_allreduce: 2.89 | reduce_tied_grads: 0.30 | comms: 17.97 | reduce_grads: 0.21 | step: 355.50 | _step_clipping: 0.13 | _step_step: 353.69 | _step_zero_grad: 0.52 | _step_check_overflow: 0.55 samples/sec: 16.168 | iteration 22840/ 143000 | elapsed time per iteration (ms): 63336.9 | learning rate: 5.675E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.354039E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 15:14:20,622] [INFO] [logging.py:60:log_dist] [Rank 0] step=22850, skipped=26, lr=[0.0005674693147422998, 0.0005674693147422998], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22850 loss: 2.3239 iter time (s): 63.707 samples/sec: 16.074 %comms: 0.0028791252007787705 %optimizer_step 0.0569582294942554 %forward: 22.857352140167958 %backward: 61.25961567358126 [2025-04-10 15:14:20,623] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27597.00 | forward: 145617.14 | backward_microstep: 390280.47 | backward: 390266.11 | backward_inner_microstep: 390246.18 | backward_inner: 390238.72 | backward_allreduce_microstep: 9.66 | backward_allreduce: 3.34 | reduce_tied_grads: 0.36 | comms: 18.34 | reduce_grads: 0.23 | step: 362.86 | _step_clipping: 0.13 | _step_step: 360.90 | _step_zero_grad: 0.53 | _step_check_overflow: 0.65 samples/sec: 16.073 | iteration 22850/ 143000 | elapsed time per iteration (ms): 63707.5 | learning rate: 5.675E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.341766E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 15:24:58,354] [INFO] [logging.py:60:log_dist] [Rank 0] step=22860, skipped=26, lr=[0.0005674394591610038, 0.0005674394591610038], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22860 loss: 2.3466 iter time (s): 63.773 samples/sec: 16.057 %comms: 0.002859151978349586 %optimizer_step 0.05677538506731307 %forward: 22.815661583861456 %backward: 61.177078680194526 [2025-04-10 15:24:58,355] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28534.31 | forward: 145501.26 | backward_microstep: 390155.29 | backward: 390141.75 | backward_inner_microstep: 390123.60 | backward_inner: 390116.70 | backward_allreduce_microstep: 8.79 | backward_allreduce: 3.02 | reduce_tied_grads: 0.35 | comms: 18.23 | reduce_grads: 0.22 | step: 362.07 | _step_clipping: 0.14 | _step_step: 360.18 | _step_zero_grad: 0.57 | _step_check_overflow: 0.52 samples/sec: 16.057 | iteration 22860/ 143000 | elapsed time per iteration (ms): 63773.2 | learning rate: 5.674E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.341759E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 15:35:39,588] [INFO] [logging.py:60:log_dist] [Rank 0] step=22870, skipped=26, lr=[0.0005674095906718801, 0.0005674095906718801], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22870 loss: 2.3547 iter time (s): 64.123 samples/sec: 15.969 %comms: 0.002828254117041788 %optimizer_step 0.05609529306877173 %forward: 22.7105858214554 %backward: 60.872602671247236 [2025-04-10 15:35:39,588] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31708.63 | forward: 145626.56 | backward_microstep: 390345.75 | backward: 390331.96 | backward_inner_microstep: 390312.50 | backward_inner: 390305.20 | backward_allreduce_microstep: 9.23 | backward_allreduce: 3.18 | reduce_tied_grads: 0.34 | comms: 18.14 | reduce_grads: 0.21 | step: 359.70 | _step_clipping: 0.13 | _step_step: 357.92 | _step_zero_grad: 0.52 | _step_check_overflow: 0.51 samples/sec: 15.969 | iteration 22870/ 143000 | elapsed time per iteration (ms): 64123.4 | learning rate: 5.674E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.342554E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 15:46:10,581] [INFO] [logging.py:60:log_dist] [Rank 0] step=22880, skipped=26, lr=[0.0005673797092763704, 0.0005673797092763704], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22880 loss: 2.3674 iter time (s): 63.099 samples/sec: 16.229 %comms: 0.002961514314396387 %optimizer_step 0.060025660963449495 %forward: 23.05474942610874 %backward: 61.839346694672926 [2025-04-10 15:46:10,582] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21736.69 | forward: 145472.48 | backward_microstep: 390212.16 | backward: 390198.26 | backward_inner_microstep: 390179.00 | backward_inner: 390171.79 | backward_allreduce_microstep: 9.32 | backward_allreduce: 3.17 | reduce_tied_grads: 0.35 | comms: 18.69 | reduce_grads: 0.26 | step: 378.75 | _step_clipping: 0.14 | _step_step: 376.48 | _step_zero_grad: 0.64 | _step_check_overflow: 0.79 samples/sec: 16.228 | iteration 22880/ 143000 | elapsed time per iteration (ms): 63099.3 | learning rate: 5.674E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.346000E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 15:56:50,304] [INFO] [logging.py:60:log_dist] [Rank 0] step=22890, skipped=26, lr=[0.0005673498149759171, 0.0005673498149759171], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22890 loss: 2.3283 iter time (s): 63.972 samples/sec: 16.007 %comms: 0.0029026183499623265 %optimizer_step 0.05948425879716585 %forward: 22.80630962069755 %backward: 61.06113981418101 [2025-04-10 15:56:50,305] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29542.60 | forward: 145895.63 | backward_microstep: 390636.53 | backward: 390617.92 | backward_inner_microstep: 390596.27 | backward_inner: 390588.43 | backward_allreduce_microstep: 10.98 | backward_allreduce: 3.25 | reduce_tied_grads: 0.39 | comms: 18.57 | reduce_grads: 0.23 | step: 380.53 | _step_clipping: 0.13 | _step_step: 378.26 | _step_zero_grad: 0.66 | _step_check_overflow: 0.74 samples/sec: 16.007 | iteration 22890/ 143000 | elapsed time per iteration (ms): 63972.3 | learning rate: 5.673E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.342133E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 16:07:27,841] [INFO] [logging.py:60:log_dist] [Rank 0] step=22900, skipped=26, lr=[0.0005673199077719629, 0.0005673199077719629], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22900 loss: 2.3372 iter time (s): 63.753 samples/sec: 16.062 %comms: 0.002836388292684416 %optimizer_step 0.055396838001511566 %forward: 22.84565870685785 %backward: 61.23103823332458 [2025-04-10 16:07:27,842] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27932.51 | forward: 145648.18 | backward_microstep: 390382.00 | backward: 390366.90 | backward_inner_microstep: 390347.57 | backward_inner: 390340.29 | backward_allreduce_microstep: 9.15 | backward_allreduce: 3.15 | reduce_tied_grads: 0.29 | comms: 18.08 | reduce_grads: 0.20 | step: 353.17 | _step_clipping: 0.12 | _step_step: 351.52 | _step_zero_grad: 0.52 | _step_check_overflow: 0.44 samples/sec: 16.062 | iteration 22900/ 143000 | elapsed time per iteration (ms): 63753.7 | learning rate: 5.673E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.336322E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 16:18:05,101] [INFO] [logging.py:60:log_dist] [Rank 0] step=22910, skipped=26, lr=[0.0005672899876659512, 0.0005672899876659512], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22910 loss: 2.3296 iter time (s): 63.725 samples/sec: 16.069 %comms: 0.003127310740042447 %optimizer_step 0.05950656364189372 %forward: 22.881948399923516 %backward: 61.255672826374365 [2025-04-10 16:18:05,102] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27461.46 | forward: 145816.27 | backward_microstep: 390368.52 | backward: 390354.61 | backward_inner_microstep: 390335.92 | backward_inner: 390328.81 | backward_allreduce_microstep: 8.92 | backward_allreduce: 3.09 | reduce_tied_grads: 2.04 | comms: 19.93 | reduce_grads: 0.22 | step: 379.21 | _step_clipping: 0.14 | _step_step: 377.38 | _step_zero_grad: 0.52 | _step_check_overflow: 0.55 samples/sec: 16.069 | iteration 22910/ 143000 | elapsed time per iteration (ms): 63726.0 | learning rate: 5.673E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.341581E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 16:28:43,349] [INFO] [logging.py:60:log_dist] [Rank 0] step=22920, skipped=26, lr=[0.0005672600546593261, 0.0005672600546593261], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22920 loss: 2.3607 iter time (s): 63.824 samples/sec: 16.044 %comms: 0.0028439121564182964 %optimizer_step 0.05618137613229358 %forward: 22.824227193487292 %backward: 61.11986371617023 [2025-04-10 16:28:43,350] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28906.99 | forward: 145673.83 | backward_microstep: 390104.13 | backward: 390092.71 | backward_inner_microstep: 390073.35 | backward_inner: 390066.76 | backward_allreduce_microstep: 10.17 | backward_allreduce: 2.92 | reduce_tied_grads: 0.33 | comms: 18.15 | reduce_grads: 0.22 | step: 358.57 | _step_clipping: 0.12 | _step_step: 356.71 | _step_zero_grad: 0.53 | _step_check_overflow: 0.57 samples/sec: 16.044 | iteration 22920/ 143000 | elapsed time per iteration (ms): 63824.8 | learning rate: 5.673E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.337631E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 16:39:15,479] [INFO] [logging.py:60:log_dist] [Rank 0] step=22930, skipped=26, lr=[0.0005672301087535324, 0.0005672301087535324], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22930 loss: 2.3403 iter time (s): 63.212 samples/sec: 16.199 %comms: 0.003005145690795257 %optimizer_step 0.05888763193299041 %forward: 22.978458340527048 %backward: 61.691872304146585 [2025-04-10 16:39:15,479] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23363.01 | forward: 145252.29 | backward_microstep: 389979.07 | backward: 389968.96 | backward_inner_microstep: 389950.00 | backward_inner: 389942.93 | backward_allreduce_microstep: 9.43 | backward_allreduce: 3.17 | reduce_tied_grads: 0.32 | comms: 19.00 | reduce_grads: 0.25 | step: 372.24 | _step_clipping: 0.14 | _step_step: 370.32 | _step_zero_grad: 0.59 | _step_check_overflow: 0.54 samples/sec: 16.199 | iteration 22930/ 143000 | elapsed time per iteration (ms): 63213.0 | learning rate: 5.672E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.338296E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 16:49:46,444] [INFO] [logging.py:60:log_dist] [Rank 0] step=22940, skipped=26, lr=[0.0005672001499500152, 0.0005672001499500152], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22940 loss: 2.3338 iter time (s): 63.096 samples/sec: 16.229 %comms: 0.0029117628119382207 %optimizer_step 0.05879959968910438 %forward: 23.040013913644987 %backward: 61.81891676487115 [2025-04-10 16:49:46,445] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21964.57 | forward: 145373.27 | backward_microstep: 390065.66 | backward: 390052.64 | backward_inner_microstep: 390032.90 | backward_inner: 390026.17 | backward_allreduce_microstep: 10.44 | backward_allreduce: 4.70 | reduce_tied_grads: 0.37 | comms: 18.37 | reduce_grads: 0.24 | step: 371.00 | _step_clipping: 0.13 | _step_step: 368.72 | _step_zero_grad: 0.59 | _step_check_overflow: 0.87 samples/sec: 16.229 | iteration 22940/ 143000 | elapsed time per iteration (ms): 63096.6 | learning rate: 5.672E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.340788E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 17:00:26,169] [INFO] [logging.py:60:log_dist] [Rank 0] step=22950, skipped=26, lr=[0.0005671701782502208, 0.0005671701782502208], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22950 loss: 2.3469 iter time (s): 63.972 samples/sec: 16.007 %comms: 0.002854832177700933 %optimizer_step 0.05806285144183714 %forward: 22.789931324891338 %backward: 61.026259260314816 [2025-04-10 17:00:26,170] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29928.95 | forward: 145791.20 | backward_microstep: 390413.37 | backward: 390395.71 | backward_inner_microstep: 390375.39 | backward_inner: 390367.28 | backward_allreduce_microstep: 9.69 | backward_allreduce: 3.32 | reduce_tied_grads: 0.34 | comms: 18.26 | reduce_grads: 0.24 | step: 371.44 | _step_clipping: 0.13 | _step_step: 369.52 | _step_zero_grad: 0.60 | _step_check_overflow: 0.55 samples/sec: 16.007 | iteration 22950/ 143000 | elapsed time per iteration (ms): 63972.4 | learning rate: 5.672E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.341165E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 17:10:53,403] [INFO] [logging.py:60:log_dist] [Rank 0] step=22960, skipped=26, lr=[0.0005671401936555954, 0.0005671401936555954], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22960 loss: 2.3472 iter time (s): 62.723 samples/sec: 16.326 %comms: 0.002884236511944716 %optimizer_step 0.0576343269913807 %forward: 23.154690896046496 %backward: 62.180433342467865 [2025-04-10 17:10:53,403] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18442.90 | forward: 145232.59 | backward_microstep: 390022.59 | backward: 390012.79 | backward_inner_microstep: 389993.95 | backward_inner: 389987.03 | backward_allreduce_microstep: 9.16 | backward_allreduce: 3.14 | reduce_tied_grads: 0.34 | comms: 18.09 | reduce_grads: 0.22 | step: 361.50 | _step_clipping: 0.12 | _step_step: 359.68 | _step_zero_grad: 0.53 | _step_check_overflow: 0.54 samples/sec: 16.326 | iteration 22960/ 143000 | elapsed time per iteration (ms): 62723.3 | learning rate: 5.671E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.344011E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 17:21:21,762] [INFO] [logging.py:60:log_dist] [Rank 0] step=22970, skipped=26, lr=[0.0005671101961675865, 0.0005671101961675865], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22970 loss: 2.3507 iter time (s): 62.835 samples/sec: 16.297 %comms: 0.002871325178869237 %optimizer_step 0.05804615645018271 %forward: 23.104560812739578 %backward: 62.05915799302426 [2025-04-10 17:21:21,763] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19672.72 | forward: 145178.44 | backward_microstep: 389960.02 | backward: 389951.22 | backward_inner_microstep: 389932.83 | backward_inner: 389926.14 | backward_allreduce_microstep: 8.99 | backward_allreduce: 3.09 | reduce_tied_grads: 0.31 | comms: 18.04 | reduce_grads: 0.20 | step: 364.74 | _step_clipping: 0.12 | _step_step: 362.86 | _step_zero_grad: 0.60 | _step_check_overflow: 0.56 samples/sec: 16.296 | iteration 22970/ 143000 | elapsed time per iteration (ms): 62836.0 | learning rate: 5.671E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.350047E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 17:31:59,001] [INFO] [logging.py:60:log_dist] [Rank 0] step=22980, skipped=26, lr=[0.0005670801857876416, 0.0005670801857876416], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22980 loss: 2.3314 iter time (s): 63.723 samples/sec: 16.069 %comms: 0.002816386762221204 %optimizer_step 0.05799302330021555 %forward: 22.827704876834364 %backward: 61.219206701877305 [2025-04-10 17:31:59,002] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28069.62 | forward: 145465.77 | backward_microstep: 390120.42 | backward: 390109.26 | backward_inner_microstep: 390091.07 | backward_inner: 390082.49 | backward_allreduce_microstep: 8.78 | backward_allreduce: 3.03 | reduce_tied_grads: 0.30 | comms: 17.95 | reduce_grads: 0.21 | step: 369.55 | _step_clipping: 0.12 | _step_step: 367.86 | _step_zero_grad: 0.51 | _step_check_overflow: 0.46 samples/sec: 16.069 | iteration 22980/ 143000 | elapsed time per iteration (ms): 63723.9 | learning rate: 5.671E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.335881E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 17:42:30,804] [INFO] [logging.py:60:log_dist] [Rank 0] step=22990, skipped=26, lr=[0.0005670501625172095, 0.0005670501625172095], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 22990 loss: 2.3493 iter time (s): 63.180 samples/sec: 16.208 %comms: 0.002884284040912659 %optimizer_step 0.05739390140381503 %forward: 23.002398627168045 %backward: 61.74859501108032 [2025-04-10 17:42:30,804] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22767.97 | forward: 145328.38 | backward_microstep: 390138.43 | backward: 390125.53 | backward_inner_microstep: 390106.95 | backward_inner: 390100.02 | backward_allreduce_microstep: 9.01 | backward_allreduce: 3.04 | reduce_tied_grads: 0.35 | comms: 18.22 | reduce_grads: 0.23 | step: 362.61 | _step_clipping: 0.13 | _step_step: 360.78 | _step_zero_grad: 0.52 | _step_check_overflow: 0.57 samples/sec: 16.208 | iteration 22990/ 143000 | elapsed time per iteration (ms): 63180.2 | learning rate: 5.671E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.330427E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 17:53:04,566] [INFO] [logging.py:60:log_dist] [Rank 0] step=23000, skipped=26, lr=[0.0005670201263577389, 0.0005670201263577389], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23000 loss: 2.3334 iter time (s): 63.376 samples/sec: 16.158 %comms: 0.002880406135414194 %optimizer_step 0.05714246344511018 %forward: 22.954798539592332 %backward: 61.557463714540226 [2025-04-10 17:53:04,567] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24569.07 | forward: 145477.49 | backward_microstep: 390135.80 | backward: 390124.34 | backward_inner_microstep: 390105.66 | backward_inner: 390098.54 | backward_allreduce_microstep: 8.95 | backward_allreduce: 3.08 | reduce_tied_grads: 0.34 | comms: 18.25 | reduce_grads: 0.22 | step: 362.14 | _step_clipping: 0.13 | _step_step: 360.29 | _step_zero_grad: 0.56 | _step_check_overflow: 0.54 samples/sec: 16.157 | iteration 23000/ 143000 | elapsed time per iteration (ms): 63376.2 | learning rate: 5.670E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.346558E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 17:53:07,524] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: ../checkpoints/mydata_left-pythia160m/global_step23000/mp_rank_00_model_states.pt [2025-04-10 17:53:21,621] [INFO] [engine.py:1805:_copy_recovery_script] creating recovery script ../checkpoints/mydata_left-pythia160m/zero_to_fp32.py [2025-04-10 17:53:21,626] [INFO] [engine.py:1818:_save_zero_checkpoint] zero checkpoint saved ../checkpoints/mydata_left-pythia160m/global_step23000/zero_pp_rank_0_mp_rank_00_optim_states.pt [2025-04-10 18:04:03,913] [INFO] [logging.py:60:log_dist] [Rank 0] step=23010, skipped=26, lr=[0.0005669900773106798, 0.0005669900773106798], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23010 loss: 2.3453 iter time (s): 64.227 samples/sec: 15.943 %comms: 0.0028144542834913967 %optimizer_step 0.057646718903494494 %forward: 22.665288822678082 %backward: 60.72666203525888 [2025-04-10 18:04:03,914] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 33065.70 | forward: 145572.55 | backward_microstep: 390040.48 | backward: 390029.66 | backward_inner_microstep: 390007.74 | backward_inner: 390000.51 | backward_allreduce_microstep: 10.07 | backward_allreduce: 3.61 | reduce_tied_grads: 0.31 | comms: 18.08 | reduce_grads: 0.23 | step: 370.25 | _step_clipping: 0.12 | _step_step: 368.18 | _step_zero_grad: 0.59 | _step_check_overflow: 0.72 samples/sec: 15.531 | iteration 23010/ 143000 | elapsed time per iteration (ms): 65934.7 | learning rate: 5.670E-04 | approx flops per GPU: 67.0TFLOPS | lm_loss: 2.341489E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 18:14:42,475] [INFO] [logging.py:60:log_dist] [Rank 0] step=23020, skipped=26, lr=[0.0005669600153774823, 0.0005669600153774823], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23020 loss: 2.3134 iter time (s): 63.856 samples/sec: 16.036 %comms: 0.002881980665339527 %optimizer_step 0.05632112619434904 %forward: 22.778655072212175 %backward: 61.08795286862855 [2025-04-10 18:14:42,476] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29423.90 | forward: 145454.41 | backward_microstep: 390092.92 | backward: 390080.64 | backward_inner_microstep: 390060.72 | backward_inner: 390053.72 | backward_allreduce_microstep: 10.34 | backward_allreduce: 3.00 | reduce_tied_grads: 0.31 | comms: 18.40 | reduce_grads: 0.38 | step: 359.64 | _step_clipping: 0.13 | _step_step: 357.76 | _step_zero_grad: 0.57 | _step_check_overflow: 0.56 samples/sec: 16.036 | iteration 23020/ 143000 | elapsed time per iteration (ms): 63856.2 | learning rate: 5.670E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.329409E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 18:25:09,974] [INFO] [logging.py:60:log_dist] [Rank 0] step=23030, skipped=26, lr=[0.0005669299405595974, 0.0005669299405595974], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23030 loss: 2.3467 iter time (s): 62.749 samples/sec: 16.319 %comms: 0.0028741211311436323 %optimizer_step 0.0563774795128469 %forward: 23.156859685599773 %backward: 62.171268106899014 [2025-04-10 18:25:09,975] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18479.01 | forward: 145307.88 | backward_microstep: 390133.87 | backward: 390120.92 | backward_inner_microstep: 390104.14 | backward_inner: 390097.59 | backward_allreduce_microstep: 7.89 | backward_allreduce: 2.75 | reduce_tied_grads: 0.32 | comms: 18.03 | reduce_grads: 0.20 | step: 353.77 | _step_clipping: 0.12 | _step_step: 352.04 | _step_zero_grad: 0.49 | _step_check_overflow: 0.50 samples/sec: 16.319 | iteration 23030/ 143000 | elapsed time per iteration (ms): 62750.0 | learning rate: 5.669E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.343687E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 18:35:32,820] [INFO] [logging.py:60:log_dist] [Rank 0] step=23040, skipped=26, lr=[0.0005668998528584766, 0.0005668998528584766], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23040 loss: 2.3479 iter time (s): 62.284 samples/sec: 16.441 %comms: 0.0028875973400077255 %optimizer_step 0.05614188208592285 %forward: 23.30439171224737 %backward: 62.60734294770048 [2025-04-10 18:35:32,820] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14217.41 | forward: 145149.03 | backward_microstep: 389953.11 | backward: 389943.45 | backward_inner_microstep: 389927.58 | backward_inner: 389921.42 | backward_allreduce_microstep: 7.57 | backward_allreduce: 2.60 | reduce_tied_grads: 0.30 | comms: 17.99 | reduce_grads: 0.20 | step: 349.67 | _step_clipping: 0.11 | _step_step: 348.02 | _step_zero_grad: 0.47 | _step_check_overflow: 0.51 samples/sec: 16.441 | iteration 23040/ 143000 | elapsed time per iteration (ms): 62284.5 | learning rate: 5.669E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.335713E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 18:45:55,535] [INFO] [logging.py:60:log_dist] [Rank 0] step=23050, skipped=26, lr=[0.000566869752275572, 0.000566869752275572], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23050 loss: 2.3366 iter time (s): 62.271 samples/sec: 16.444 %comms: 0.0029249591378384242 %optimizer_step 0.05592053234858185 %forward: 23.313499621703563 %backward: 62.626304043851555 [2025-04-10 18:45:55,535] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14001.36 | forward: 145175.29 | backward_microstep: 389989.62 | backward: 389979.71 | backward_inner_microstep: 389962.01 | backward_inner: 389955.79 | backward_allreduce_microstep: 9.24 | backward_allreduce: 2.58 | reduce_tied_grads: 0.32 | comms: 18.21 | reduce_grads: 0.19 | step: 348.22 | _step_clipping: 0.10 | _step_step: 346.54 | _step_zero_grad: 0.48 | _step_check_overflow: 0.53 samples/sec: 16.444 | iteration 23050/ 143000 | elapsed time per iteration (ms): 62271.5 | learning rate: 5.669E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.340965E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 18:56:19,260] [INFO] [logging.py:60:log_dist] [Rank 0] step=23060, skipped=26, lr=[0.0005668396388123366, 0.0005668396388123366], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23060 loss: 2.3293 iter time (s): 62.372 samples/sec: 16.418 %comms: 0.002928895367853584 %optimizer_step 0.05693964944582597 %forward: 23.281225265453866 %backward: 62.52212921558936 [2025-04-10 18:56:19,261] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14984.68 | forward: 145209.68 | backward_microstep: 389973.28 | backward: 389963.07 | backward_inner_microstep: 389946.85 | backward_inner: 389940.38 | backward_allreduce_microstep: 7.66 | backward_allreduce: 2.64 | reduce_tied_grads: 0.27 | comms: 18.27 | reduce_grads: 0.20 | step: 355.14 | _step_clipping: 0.11 | _step_step: 353.44 | _step_zero_grad: 0.53 | _step_check_overflow: 0.45 samples/sec: 16.417 | iteration 23060/ 143000 | elapsed time per iteration (ms): 62372.6 | learning rate: 5.668E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.336021E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 19:06:42,337] [INFO] [logging.py:60:log_dist] [Rank 0] step=23070, skipped=26, lr=[0.0005668095124702235, 0.0005668095124702235], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23070 loss: 2.3512 iter time (s): 62.307 samples/sec: 16.435 %comms: 0.0028755782842399867 %optimizer_step 0.05595900060336481 %forward: 23.29862506249898 %backward: 62.58608604406063 [2025-04-10 19:06:42,338] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 14403.02 | forward: 145167.17 | backward_microstep: 389965.80 | backward: 389956.26 | backward_inner_microstep: 389940.25 | backward_inner: 389933.78 | backward_allreduce_microstep: 7.58 | backward_allreduce: 2.60 | reduce_tied_grads: 0.28 | comms: 17.92 | reduce_grads: 0.19 | step: 348.66 | _step_clipping: 0.10 | _step_step: 347.02 | _step_zero_grad: 0.47 | _step_check_overflow: 0.53 samples/sec: 16.435 | iteration 23070/ 143000 | elapsed time per iteration (ms): 62307.7 | learning rate: 5.668E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.339971E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 19:17:15,740] [INFO] [logging.py:60:log_dist] [Rank 0] step=23080, skipped=26, lr=[0.0005667793732506872, 0.0005667793732506872], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23080 loss: 2.3420 iter time (s): 63.340 samples/sec: 16.167 %comms: 0.0031172222608068013 %optimizer_step 0.06028972244521974 %forward: 22.970553223188226 %backward: 61.58269488881272 [2025-04-10 19:17:15,741] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24247.08 | forward: 145494.82 | backward_microstep: 390074.63 | backward: 390063.02 | backward_inner_microstep: 390044.61 | backward_inner: 390037.40 | backward_allreduce_microstep: 8.81 | backward_allreduce: 3.04 | reduce_tied_grads: 0.27 | comms: 19.74 | reduce_grads: 0.20 | step: 381.87 | _step_clipping: 0.11 | _step_step: 380.19 | _step_zero_grad: 0.52 | _step_check_overflow: 0.47 samples/sec: 16.167 | iteration 23080/ 143000 | elapsed time per iteration (ms): 63340.3 | learning rate: 5.668E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.330558E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 19:27:58,567] [INFO] [logging.py:60:log_dist] [Rank 0] step=23090, skipped=26, lr=[0.0005667492211551818, 0.0005667492211551818], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23090 loss: 2.3398 iter time (s): 64.282 samples/sec: 15.930 %comms: 0.0030703715590319736 %optimizer_step 0.05599300972572524 %forward: 22.63155020445213 %backward: 60.69835111715588 [2025-04-10 19:27:58,568] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 33581.32 | forward: 145480.45 | backward_microstep: 390194.41 | backward: 390181.99 | backward_inner_microstep: 390163.47 | backward_inner: 390156.20 | backward_allreduce_microstep: 8.88 | backward_allreduce: 3.05 | reduce_tied_grads: 0.33 | comms: 19.74 | reduce_grads: 0.22 | step: 359.94 | _step_clipping: 0.13 | _step_step: 358.21 | _step_zero_grad: 0.51 | _step_check_overflow: 0.50 samples/sec: 15.930 | iteration 23090/ 143000 | elapsed time per iteration (ms): 64282.7 | learning rate: 5.667E-04 | approx flops per GPU: 68.7TFLOPS | lm_loss: 2.335709E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 19:38:38,617] [INFO] [logging.py:60:log_dist] [Rank 0] step=23100, skipped=26, lr=[0.000566719056185163, 0.000566719056185163], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23100 loss: 2.3470 iter time (s): 64.004 samples/sec: 15.999 %comms: 0.0029251595338769553 %optimizer_step 0.059603492004619694 %forward: 22.752149670041728 %backward: 60.978254479426795 [2025-04-10 19:38:38,618] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30508.68 | forward: 145623.67 | backward_microstep: 390301.70 | backward: 390287.40 | backward_inner_microstep: 390268.75 | backward_inner: 390261.63 | backward_allreduce_microstep: 8.85 | backward_allreduce: 3.07 | reduce_tied_grads: 0.36 | comms: 18.72 | reduce_grads: 0.23 | step: 381.49 | _step_clipping: 0.13 | _step_step: 379.16 | _step_zero_grad: 0.68 | _step_check_overflow: 0.80 samples/sec: 15.999 | iteration 23100/ 143000 | elapsed time per iteration (ms): 64005.0 | learning rate: 5.667E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.336192E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 19:49:38,953] [INFO] [logging.py:60:log_dist] [Rank 0] step=23110, skipped=26, lr=[0.0005666888783420864, 0.0005666888783420864], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23110 loss: 2.3474 iter time (s): 66.033 samples/sec: 15.507 %comms: 0.002746985027773316 %optimizer_step 0.055688792948721054 %forward: 22.12048939094621 %backward: 59.13449198365511 [2025-04-10 19:49:38,954] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 50126.34 | forward: 146067.89 | backward_microstep: 390499.57 | backward: 390481.88 | backward_inner_microstep: 390461.97 | backward_inner: 390454.18 | backward_allreduce_microstep: 9.26 | backward_allreduce: 3.20 | reduce_tied_grads: 0.34 | comms: 18.14 | reduce_grads: 0.22 | step: 367.73 | _step_clipping: 0.13 | _step_step: 365.85 | _step_zero_grad: 0.60 | _step_check_overflow: 0.50 samples/sec: 15.507 | iteration 23110/ 143000 | elapsed time per iteration (ms): 66033.6 | learning rate: 5.667E-04 | approx flops per GPU: 66.9TFLOPS | lm_loss: 2.335526E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 20:00:14,099] [INFO] [logging.py:60:log_dist] [Rank 0] step=23120, skipped=26, lr=[0.0005666586876274088, 0.0005666586876274088], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23120 loss: 2.3417 iter time (s): 63.514 samples/sec: 16.122 %comms: 0.0028919634762201597 %optimizer_step 0.056900744961735826 %forward: 22.923265030916355 %backward: 61.44334022052528 [2025-04-10 20:00:14,099] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25710.16 | forward: 145594.74 | backward_microstep: 390264.36 | backward: 390251.00 | backward_inner_microstep: 390230.89 | backward_inner: 390223.53 | backward_allreduce_microstep: 10.39 | backward_allreduce: 3.01 | reduce_tied_grads: 0.33 | comms: 18.37 | reduce_grads: 0.20 | step: 361.40 | _step_clipping: 0.12 | _step_step: 359.46 | _step_zero_grad: 0.55 | _step_check_overflow: 0.60 samples/sec: 16.122 | iteration 23120/ 143000 | elapsed time per iteration (ms): 63514.5 | learning rate: 5.667E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.347165E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 20:10:40,667] [INFO] [logging.py:60:log_dist] [Rank 0] step=23130, skipped=26, lr=[0.0005666284840425872, 0.0005666284840425872], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23130 loss: 2.3332 iter time (s): 62.656 samples/sec: 16.343 %comms: 0.0028980669058499883 %optimizer_step 0.05688020897046795 %forward: 23.20610440493392 %backward: 62.264752517289466 [2025-04-10 20:10:40,668] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17454.57 | forward: 145400.72 | backward_microstep: 390139.42 | backward: 390127.52 | backward_inner_microstep: 390110.57 | backward_inner: 390103.91 | backward_allreduce_microstep: 7.98 | backward_allreduce: 2.73 | reduce_tied_grads: 0.31 | comms: 18.16 | reduce_grads: 0.21 | step: 356.39 | _step_clipping: 0.11 | _step_step: 354.66 | _step_zero_grad: 0.50 | _step_check_overflow: 0.52 samples/sec: 16.343 | iteration 23130/ 143000 | elapsed time per iteration (ms): 62656.8 | learning rate: 5.666E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.343058E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 20:21:06,549] [INFO] [logging.py:60:log_dist] [Rank 0] step=23140, skipped=26, lr=[0.0005665982675890793, 0.0005665982675890793], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23140 loss: 2.3434 iter time (s): 62.588 samples/sec: 16.361 %comms: 0.0028881029494024547 %optimizer_step 0.0567196281395422 %forward: 23.21422946503167 %backward: 62.332387281022875 [2025-04-10 20:21:06,549] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16888.47 | forward: 145292.29 | backward_microstep: 390135.11 | backward: 390123.45 | backward_inner_microstep: 390106.32 | backward_inner: 390099.64 | backward_allreduce_microstep: 8.08 | backward_allreduce: 2.78 | reduce_tied_grads: 0.33 | comms: 18.08 | reduce_grads: 0.21 | step: 354.99 | _step_clipping: 0.11 | _step_step: 353.20 | _step_zero_grad: 0.56 | _step_check_overflow: 0.52 samples/sec: 16.361 | iteration 23140/ 143000 | elapsed time per iteration (ms): 62588.2 | learning rate: 5.666E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.345816E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 20:31:50,307] [INFO] [logging.py:60:log_dist] [Rank 0] step=23150, skipped=26, lr=[0.0005665680382683436, 0.0005665680382683436], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23150 loss: 2.3229 iter time (s): 64.375 samples/sec: 15.907 %comms: 0.0029100131007920938 %optimizer_step 0.059923796559152705 %forward: 22.645355777511725 %backward: 60.623269111769176 [2025-04-10 20:31:50,307] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 34045.33 | forward: 145779.90 | backward_microstep: 390278.03 | backward: 390263.41 | backward_inner_microstep: 390243.37 | backward_inner: 390234.11 | backward_allreduce_microstep: 9.66 | backward_allreduce: 3.32 | reduce_tied_grads: 0.36 | comms: 18.73 | reduce_grads: 0.24 | step: 385.76 | _step_clipping: 0.13 | _step_step: 383.83 | _step_zero_grad: 0.57 | _step_check_overflow: 0.55 samples/sec: 15.907 | iteration 23150/ 143000 | elapsed time per iteration (ms): 64375.8 | learning rate: 5.666E-04 | approx flops per GPU: 68.6TFLOPS | lm_loss: 2.340405E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 20:42:27,036] [INFO] [logging.py:60:log_dist] [Rank 0] step=23160, skipped=26, lr=[0.000566537796081839, 0.000566537796081839], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23160 loss: 2.3421 iter time (s): 63.672 samples/sec: 16.082 %comms: 0.0028840221844029464 %optimizer_step 0.05894303887685008 %forward: 22.88419311909961 %backward: 61.29962035553649 [2025-04-10 20:42:27,037] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27085.84 | forward: 145708.96 | backward_microstep: 390325.52 | backward: 390308.88 | backward_inner_microstep: 390288.62 | backward_inner: 390279.61 | backward_allreduce_microstep: 10.56 | backward_allreduce: 4.80 | reduce_tied_grads: 0.37 | comms: 18.36 | reduce_grads: 0.22 | step: 375.30 | _step_clipping: 0.13 | _step_step: 373.08 | _step_zero_grad: 0.59 | _step_check_overflow: 0.83 samples/sec: 16.082 | iteration 23160/ 143000 | elapsed time per iteration (ms): 63672.9 | learning rate: 5.665E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.348956E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 20:52:59,722] [INFO] [logging.py:60:log_dist] [Rank 0] step=23170, skipped=26, lr=[0.0005665075410310252, 0.0005665075410310252], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23170 loss: 2.3244 iter time (s): 63.268 samples/sec: 16.185 %comms: 0.002896579831248074 %optimizer_step 0.05660723050788111 %forward: 23.026113238279557 %backward: 61.721065833345435 [2025-04-10 20:52:59,722] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22832.27 | forward: 145681.32 | backward_microstep: 390515.65 | backward: 390496.05 | backward_inner_microstep: 390477.11 | backward_inner: 390469.77 | backward_allreduce_microstep: 8.82 | backward_allreduce: 3.03 | reduce_tied_grads: 0.30 | comms: 18.33 | reduce_grads: 0.21 | step: 358.14 | _step_clipping: 0.13 | _step_step: 356.22 | _step_zero_grad: 0.56 | _step_check_overflow: 0.57 samples/sec: 16.185 | iteration 23170/ 143000 | elapsed time per iteration (ms): 63268.5 | learning rate: 5.665E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.335973E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 21:03:33,616] [INFO] [logging.py:60:log_dist] [Rank 0] step=23180, skipped=26, lr=[0.0005664772731173624, 0.0005664772731173624], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23180 loss: 2.3310 iter time (s): 63.389 samples/sec: 16.154 %comms: 0.0028441512533312276 %optimizer_step 0.05674592898740542 %forward: 22.95436375052386 %backward: 61.56187108819188 [2025-04-10 21:03:33,616] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24541.69 | forward: 145504.98 | backward_microstep: 390246.74 | backward: 390233.37 | backward_inner_microstep: 390215.46 | backward_inner: 390208.56 | backward_allreduce_microstep: 8.51 | backward_allreduce: 2.91 | reduce_tied_grads: 0.32 | comms: 18.03 | reduce_grads: 0.22 | step: 359.71 | _step_clipping: 0.11 | _step_step: 357.93 | _step_zero_grad: 0.51 | _step_check_overflow: 0.53 samples/sec: 16.154 | iteration 23180/ 143000 | elapsed time per iteration (ms): 63389.4 | learning rate: 5.665E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.330116E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 21:14:12,358] [INFO] [logging.py:60:log_dist] [Rank 0] step=23190, skipped=26, lr=[0.0005664469923423115, 0.0005664469923423115], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23190 loss: 2.3264 iter time (s): 63.874 samples/sec: 16.032 %comms: 0.0028480598143549203 %optimizer_step 0.05778559241637381 %forward: 22.799782548443968 %backward: 61.07979394561034 [2025-04-10 21:14:12,358] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29389.31 | forward: 145630.36 | backward_microstep: 390150.39 | backward: 390138.48 | backward_inner_microstep: 390118.53 | backward_inner: 390111.35 | backward_allreduce_microstep: 9.78 | backward_allreduce: 3.39 | reduce_tied_grads: 0.37 | comms: 18.19 | reduce_grads: 0.25 | step: 369.10 | _step_clipping: 0.15 | _step_step: 367.07 | _step_zero_grad: 0.60 | _step_check_overflow: 0.59 samples/sec: 16.032 | iteration 23190/ 143000 | elapsed time per iteration (ms): 63874.2 | learning rate: 5.664E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.331620E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 21:24:53,499] [INFO] [logging.py:60:log_dist] [Rank 0] step=23200, skipped=26, lr=[0.0005664166987073339, 0.0005664166987073339], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23200 loss: 2.3447 iter time (s): 64.113 samples/sec: 15.972 %comms: 0.002846885006002578 %optimizer_step 0.058307965268455116 %forward: 22.687278880657352 %backward: 60.84789087463014 [2025-04-10 21:24:53,500] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31936.06 | forward: 145456.06 | backward_microstep: 390131.41 | backward: 390117.05 | backward_inner_microstep: 390095.33 | backward_inner: 390087.99 | backward_allreduce_microstep: 9.49 | backward_allreduce: 3.25 | reduce_tied_grads: 0.38 | comms: 18.25 | reduce_grads: 0.25 | step: 373.83 | _step_clipping: 0.14 | _step_step: 371.70 | _step_zero_grad: 0.66 | _step_check_overflow: 0.65 samples/sec: 15.972 | iteration 23200/ 143000 | elapsed time per iteration (ms): 64114.1 | learning rate: 5.664E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.336592E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 21:35:33,308] [INFO] [logging.py:60:log_dist] [Rank 0] step=23210, skipped=26, lr=[0.0005663863922138919, 0.0005663863922138919], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23210 loss: 2.3181 iter time (s): 63.980 samples/sec: 16.005 %comms: 0.0028581390463081145 %optimizer_step 0.05775014174765812 %forward: 22.73369361002225 %backward: 60.9765024801607 [2025-04-10 21:35:33,310] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30639.63 | forward: 145450.91 | backward_microstep: 390142.03 | backward: 390129.65 | backward_inner_microstep: 390110.04 | backward_inner: 390102.72 | backward_allreduce_microstep: 9.37 | backward_allreduce: 3.23 | reduce_tied_grads: 0.34 | comms: 18.29 | reduce_grads: 0.27 | step: 369.49 | _step_clipping: 0.15 | _step_step: 367.28 | _step_zero_grad: 0.62 | _step_check_overflow: 0.75 samples/sec: 16.005 | iteration 23210/ 143000 | elapsed time per iteration (ms): 63981.0 | learning rate: 5.664E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.340087E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 21:46:28,924] [INFO] [logging.py:60:log_dist] [Rank 0] step=23220, skipped=26, lr=[0.000566356072863448, 0.000566356072863448], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23220 loss: 2.3508 iter time (s): 65.561 samples/sec: 15.619 %comms: 0.0027726504682830558 %optimizer_step 0.05479758553147482 %forward: 22.871381190152228 %backward: 59.54648770573496 [2025-04-10 21:46:28,925] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 41650.37 | forward: 149946.85 | backward_microstep: 390406.89 | backward: 390392.17 | backward_inner_microstep: 390371.25 | backward_inner: 390363.22 | backward_allreduce_microstep: 10.12 | backward_allreduce: 3.48 | reduce_tied_grads: 0.35 | comms: 18.18 | reduce_grads: 0.21 | step: 359.26 | _step_clipping: 0.13 | _step_step: 357.31 | _step_zero_grad: 0.59 | _step_check_overflow: 0.60 samples/sec: 15.619 | iteration 23220/ 143000 | elapsed time per iteration (ms): 65561.5 | learning rate: 5.664E-04 | approx flops per GPU: 67.4TFLOPS | lm_loss: 2.337097E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 21:57:05,326] [INFO] [logging.py:60:log_dist] [Rank 0] step=23230, skipped=26, lr=[0.0005663257406574656, 0.0005663257406574656], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23230 loss: 2.3370 iter time (s): 63.640 samples/sec: 16.091 %comms: 0.0028888379203121164 %optimizer_step 0.06138685047830972 %forward: 22.857985191265353 %backward: 61.301409097549886 [2025-04-10 21:57:05,328] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27170.22 | forward: 145467.36 | backward_microstep: 390131.46 | backward: 390119.88 | backward_inner_microstep: 390100.24 | backward_inner: 390093.12 | backward_allreduce_microstep: 9.65 | backward_allreduce: 3.31 | reduce_tied_grads: 0.40 | comms: 18.38 | reduce_grads: 0.25 | step: 390.66 | _step_clipping: 0.15 | _step_step: 388.20 | _step_zero_grad: 0.62 | _step_check_overflow: 0.98 samples/sec: 16.090 | iteration 23230/ 143000 | elapsed time per iteration (ms): 63640.3 | learning rate: 5.663E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.345089E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 22:07:42,710] [INFO] [logging.py:60:log_dist] [Rank 0] step=23240, skipped=26, lr=[0.0005662953955974088, 0.0005662953955974088], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23240 loss: 2.3399 iter time (s): 63.738 samples/sec: 16.066 %comms: 0.002857423241294093 %optimizer_step 0.05786157687960728 %forward: 22.876702875038426 %backward: 61.22240935087826 [2025-04-10 22:07:42,711] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27710.10 | forward: 145810.83 | backward_microstep: 390231.14 | backward: 390217.52 | backward_inner_microstep: 390198.40 | backward_inner: 390191.37 | backward_allreduce_microstep: 9.23 | backward_allreduce: 3.18 | reduce_tied_grads: 0.33 | comms: 18.21 | reduce_grads: 0.23 | step: 368.80 | _step_clipping: 0.12 | _step_step: 366.79 | _step_zero_grad: 0.59 | _step_check_overflow: 0.66 samples/sec: 16.066 | iteration 23240/ 143000 | elapsed time per iteration (ms): 63738.3 | learning rate: 5.663E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.343568E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 22:18:12,935] [INFO] [logging.py:60:log_dist] [Rank 0] step=23250, skipped=26, lr=[0.000566265037684742, 0.000566265037684742], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23250 loss: 2.3460 iter time (s): 63.022 samples/sec: 16.248 %comms: 0.002928842223512673 %optimizer_step 0.0578202990901038 %forward: 23.10319964105595 %backward: 61.93870428321166 [2025-04-10 22:18:12,936] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20575.34 | forward: 145600.82 | backward_microstep: 390364.28 | backward: 390349.65 | backward_inner_microstep: 390331.24 | backward_inner: 390324.12 | backward_allreduce_microstep: 8.86 | backward_allreduce: 3.05 | reduce_tied_grads: 0.31 | comms: 18.46 | reduce_grads: 0.20 | step: 364.39 | _step_clipping: 0.11 | _step_step: 362.45 | _step_zero_grad: 0.51 | _step_check_overflow: 0.71 samples/sec: 16.248 | iteration 23250/ 143000 | elapsed time per iteration (ms): 63022.5 | learning rate: 5.663E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.335612E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 22:20:19,244] [INFO] [stage1.py:697:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 524288.0, reducing to 524288.0 [2025-04-10 22:21:22,490] [INFO] [stage1.py:697:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 524288.0, reducing to 262144.0 [2025-04-10 22:28:55,213] [INFO] [logging.py:60:log_dist] [Rank 0] step=23260, skipped=28, lr=[0.000566240742101714, 0.000566240742101714], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23260 loss: 2.3213 iter time (s): 64.227 samples/sec: 15.943 %comms: 0.0023438690902893478 %optimizer_step 0.04821756814693955 %forward: 22.755050984899075 %backward: 60.83253344375472 [2025-04-10 22:28:55,214] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31689.93 | forward: 146149.06 | backward_microstep: 390728.71 | backward: 390709.62 | backward_inner_microstep: 390689.39 | backward_inner: 390677.89 | backward_allreduce_microstep: 9.59 | backward_allreduce: 3.43 | reduce_tied_grads: 0.38 | comms: 15.05 | reduce_grads: 0.24 | step: 309.69 | _step_clipping: 0.14 | _step_step: 307.35 | _step_zero_grad: 0.70 | _step_check_overflow: 0.76 samples/sec: 15.943 | iteration 23260/ 143000 | elapsed time per iteration (ms): 64227.8 | learning rate: 5.662E-04 | approx flops per GPU: 68.8TFLOPS | lm_loss: 2.332318E+00 | loss scale: 262144.0 | number of skipped iterations: 2 | number of nan iterations: 0 | time (ms) [2025-04-10 22:39:32,745] [INFO] [logging.py:60:log_dist] [Rank 0] step=23270, skipped=28, lr=[0.0005662103610580422, 0.0005662103610580422], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23270 loss: 2.3210 iter time (s): 63.752 samples/sec: 16.062 %comms: 0.0028690264185384238 %optimizer_step 0.056737776998120694 %forward: 22.84874232659032 %backward: 61.20369700656762 [2025-04-10 22:39:32,745] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27998.68 | forward: 145666.42 | backward_microstep: 390201.14 | backward: 390188.82 | backward_inner_microstep: 390170.77 | backward_inner: 390163.97 | backward_allreduce_microstep: 8.67 | backward_allreduce: 2.96 | reduce_tied_grads: 0.36 | comms: 18.29 | reduce_grads: 0.23 | step: 361.72 | _step_clipping: 0.12 | _step_step: 359.82 | _step_zero_grad: 0.55 | _step_check_overflow: 0.59 samples/sec: 16.062 | iteration 23270/ 143000 | elapsed time per iteration (ms): 63753.1 | learning rate: 5.662E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.331581E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 22:50:08,787] [INFO] [logging.py:60:log_dist] [Rank 0] step=23280, skipped=28, lr=[0.0005661799671658649, 0.0005661799671658649], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23280 loss: 2.3230 iter time (s): 63.604 samples/sec: 16.100 %comms: 0.0028434665920837954 %optimizer_step 0.05719241556026337 %forward: 22.8832537803807 %backward: 61.3304692414625 [2025-04-10 22:50:08,788] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26745.86 | forward: 145545.80 | backward_microstep: 390096.02 | backward: 390084.05 | backward_inner_microstep: 390064.99 | backward_inner: 390058.01 | backward_allreduce_microstep: 9.37 | backward_allreduce: 3.20 | reduce_tied_grads: 0.32 | comms: 18.09 | reduce_grads: 0.22 | step: 363.76 | _step_clipping: 0.13 | _step_step: 361.83 | _step_zero_grad: 0.56 | _step_check_overflow: 0.61 samples/sec: 16.100 | iteration 23280/ 143000 | elapsed time per iteration (ms): 63604.2 | learning rate: 5.662E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.334217E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 23:00:44,578] [INFO] [logging.py:60:log_dist] [Rank 0] step=23290, skipped=28, lr=[0.0005661495604266484, 0.0005661495604266484], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23290 loss: 2.3276 iter time (s): 63.579 samples/sec: 16.106 %comms: 0.002860977806291948 %optimizer_step 0.056066885031376794 %forward: 22.889204508152826 %backward: 61.386144620895486 [2025-04-10 23:00:44,579] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26324.12 | forward: 145526.14 | backward_microstep: 390297.27 | backward: 390283.92 | backward_inner_microstep: 390266.08 | backward_inner: 390259.12 | backward_allreduce_microstep: 8.48 | backward_allreduce: 2.94 | reduce_tied_grads: 0.32 | comms: 18.19 | reduce_grads: 0.21 | step: 356.46 | _step_clipping: 0.13 | _step_step: 354.58 | _step_zero_grad: 0.52 | _step_check_overflow: 0.60 samples/sec: 16.106 | iteration 23290/ 143000 | elapsed time per iteration (ms): 63579.2 | learning rate: 5.661E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.332392E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 23:11:16,512] [INFO] [logging.py:60:log_dist] [Rank 0] step=23300, skipped=28, lr=[0.0005661191408418608, 0.0005661191408418608], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23300 loss: 2.3237 iter time (s): 63.193 samples/sec: 16.204 %comms: 0.0028633538715133297 %optimizer_step 0.05721999185410477 %forward: 23.02916005355157 %backward: 61.734382098843874 [2025-04-10 23:11:16,512] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22661.24 | forward: 145527.44 | backward_microstep: 390125.94 | backward: 390116.12 | backward_inner_microstep: 390098.30 | backward_inner: 390091.75 | backward_allreduce_microstep: 8.52 | backward_allreduce: 2.91 | reduce_tied_grads: 0.37 | comms: 18.09 | reduce_grads: 0.22 | step: 361.59 | _step_clipping: 0.14 | _step_step: 359.59 | _step_zero_grad: 0.51 | _step_check_overflow: 0.73 samples/sec: 16.204 | iteration 23300/ 143000 | elapsed time per iteration (ms): 63193.3 | learning rate: 5.661E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.329880E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 23:22:00,883] [INFO] [logging.py:60:log_dist] [Rank 0] step=23310, skipped=28, lr=[0.0005660887084129702, 0.0005660887084129702], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23310 loss: 2.3315 iter time (s): 64.437 samples/sec: 15.892 %comms: 0.0028315032637235824 %optimizer_step 0.05769172174607634 %forward: 22.686748007793657 %backward: 60.58412138524768 [2025-04-10 23:22:00,884] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 34122.59 | forward: 146185.50 | backward_microstep: 390396.18 | backward: 390382.96 | backward_inner_microstep: 390363.69 | backward_inner: 390356.32 | backward_allreduce_microstep: 9.14 | backward_allreduce: 3.16 | reduce_tied_grads: 0.36 | comms: 18.25 | reduce_grads: 0.25 | step: 371.75 | _step_clipping: 0.16 | _step_step: 369.70 | _step_zero_grad: 0.59 | _step_check_overflow: 0.65 samples/sec: 15.891 | iteration 23310/ 143000 | elapsed time per iteration (ms): 64437.1 | learning rate: 5.661E-04 | approx flops per GPU: 68.6TFLOPS | lm_loss: 2.333730E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 23:32:39,080] [INFO] [logging.py:60:log_dist] [Rank 0] step=23320, skipped=28, lr=[0.0005660582631414452, 0.0005660582631414452], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23320 loss: 2.3291 iter time (s): 63.819 samples/sec: 16.045 %comms: 0.0028589735275459213 %optimizer_step 0.05706776854841788 %forward: 22.828895548011864 %backward: 61.17630493260532 [2025-04-10 23:32:39,081] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28401.85 | forward: 145691.84 | backward_microstep: 390435.87 | backward: 390421.36 | backward_inner_microstep: 390403.03 | backward_inner: 390390.47 | backward_allreduce_microstep: 8.62 | backward_allreduce: 2.90 | reduce_tied_grads: 0.34 | comms: 18.25 | reduce_grads: 0.22 | step: 364.20 | _step_clipping: 0.13 | _step_step: 362.22 | _step_zero_grad: 0.56 | _step_check_overflow: 0.65 samples/sec: 16.045 | iteration 23320/ 143000 | elapsed time per iteration (ms): 63819.7 | learning rate: 5.661E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.329699E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 23:43:18,016] [INFO] [logging.py:60:log_dist] [Rank 0] step=23330, skipped=28, lr=[0.0005660278050287556, 0.0005660278050287556], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23330 loss: 2.3562 iter time (s): 63.893 samples/sec: 16.027 %comms: 0.0028762612370074768 %optimizer_step 0.05749768606873138 %forward: 22.775070508290334 %backward: 61.07015850293549 [2025-04-10 23:43:18,017] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29571.79 | forward: 145516.82 | backward_microstep: 390206.97 | backward: 390195.74 | backward_inner_microstep: 390174.18 | backward_inner: 390167.29 | backward_allreduce_microstep: 10.45 | backward_allreduce: 4.72 | reduce_tied_grads: 0.38 | comms: 18.38 | reduce_grads: 0.23 | step: 367.37 | _step_clipping: 0.13 | _step_step: 363.74 | _step_zero_grad: 0.52 | _step_check_overflow: 0.62 samples/sec: 16.027 | iteration 23330/ 143000 | elapsed time per iteration (ms): 63893.6 | learning rate: 5.660E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.332563E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-10 23:53:51,065] [INFO] [logging.py:60:log_dist] [Rank 0] step=23340, skipped=28, lr=[0.000565997334076371, 0.000565997334076371], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23340 loss: 2.3316 iter time (s): 63.304 samples/sec: 16.176 %comms: 0.0028926532161991808 %optimizer_step 0.0567453406550187 %forward: 22.992218667562796 %backward: 61.667507237807776 [2025-04-10 23:53:51,066] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23436.90 | forward: 145550.63 | backward_microstep: 390397.35 | backward: 390381.84 | backward_inner_microstep: 390363.55 | backward_inner: 390356.52 | backward_allreduce_microstep: 8.70 | backward_allreduce: 3.12 | reduce_tied_grads: 0.37 | comms: 18.31 | reduce_grads: 0.21 | step: 359.22 | _step_clipping: 0.13 | _step_step: 357.33 | _step_zero_grad: 0.54 | _step_check_overflow: 0.56 samples/sec: 16.176 | iteration 23340/ 143000 | elapsed time per iteration (ms): 63304.9 | learning rate: 5.660E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.329895E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 00:04:22,108] [INFO] [logging.py:60:log_dist] [Rank 0] step=23350, skipped=28, lr=[0.0005659668502857625, 0.0005659668502857625], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23350 loss: 2.3257 iter time (s): 63.104 samples/sec: 16.227 %comms: 0.00316541574384152 %optimizer_step 0.059059424033788076 %forward: 23.03368391921896 %backward: 61.835921975941964 [2025-04-11 00:04:22,108] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21831.88 | forward: 145351.09 | backward_microstep: 390224.22 | backward: 390207.59 | backward_inner_microstep: 390185.60 | backward_inner: 390177.11 | backward_allreduce_microstep: 10.64 | backward_allreduce: 4.95 | reduce_tied_grads: 0.35 | comms: 19.97 | reduce_grads: 0.22 | step: 372.69 | _step_clipping: 0.12 | _step_step: 370.89 | _step_zero_grad: 0.50 | _step_check_overflow: 0.56 samples/sec: 16.227 | iteration 23350/ 143000 | elapsed time per iteration (ms): 63104.3 | learning rate: 5.660E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.339505E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 00:14:56,701] [INFO] [logging.py:60:log_dist] [Rank 0] step=23360, skipped=28, lr=[0.000565936353658401, 0.000565936353658401], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23360 loss: 2.3274 iter time (s): 63.459 samples/sec: 16.136 %comms: 0.0028894485014538864 %optimizer_step 0.05637379512036638 %forward: 22.941648298914018 %backward: 61.48726092868617 [2025-04-11 00:14:56,701] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25188.05 | forward: 145584.66 | backward_microstep: 390202.09 | backward: 390190.02 | backward_inner_microstep: 390171.50 | backward_inner: 390164.13 | backward_allreduce_microstep: 8.89 | backward_allreduce: 3.07 | reduce_tied_grads: 0.32 | comms: 18.34 | reduce_grads: 0.22 | step: 357.74 | _step_clipping: 0.12 | _step_step: 355.78 | _step_zero_grad: 0.60 | _step_check_overflow: 0.59 samples/sec: 16.136 | iteration 23360/ 143000 | elapsed time per iteration (ms): 63459.3 | learning rate: 5.659E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.331091E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 00:25:23,434] [INFO] [logging.py:60:log_dist] [Rank 0] step=23370, skipped=28, lr=[0.0005659058441957588, 0.0005659058441957588], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23370 loss: 2.3317 iter time (s): 62.673 samples/sec: 16.339 %comms: 0.0028752006596092254 %optimizer_step 0.0600896395276214 %forward: 23.205264283786413 %backward: 62.22950339589368 [2025-04-11 00:25:23,435] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17674.66 | forward: 145433.80 | backward_microstep: 390018.81 | backward: 390009.48 | backward_inner_microstep: 389992.44 | backward_inner: 389985.70 | backward_allreduce_microstep: 8.18 | backward_allreduce: 2.83 | reduce_tied_grads: 0.30 | comms: 18.02 | reduce_grads: 0.22 | step: 376.60 | _step_clipping: 0.13 | _step_step: 374.67 | _step_zero_grad: 0.53 | _step_check_overflow: 0.66 samples/sec: 16.339 | iteration 23370/ 143000 | elapsed time per iteration (ms): 62673.4 | learning rate: 5.659E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.330272E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 00:36:13,495] [INFO] [logging.py:60:log_dist] [Rank 0] step=23380, skipped=28, lr=[0.000565875321899308, 0.000565875321899308], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23380 loss: 2.3061 iter time (s): 65.005 samples/sec: 15.753 %comms: 0.0027681029410101094 %optimizer_step 0.05899547588264978 %forward: 22.47758179178452 %backward: 60.04260477670324 [2025-04-11 00:36:13,496] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 39951.68 | forward: 146116.43 | backward_microstep: 390324.08 | backward: 390309.39 | backward_inner_microstep: 390286.69 | backward_inner: 390279.26 | backward_allreduce_microstep: 9.05 | backward_allreduce: 3.12 | reduce_tied_grads: 0.29 | comms: 17.99 | reduce_grads: 0.20 | step: 383.50 | _step_clipping: 0.11 | _step_step: 381.65 | _step_zero_grad: 0.61 | _step_check_overflow: 0.49 samples/sec: 15.752 | iteration 23380/ 143000 | elapsed time per iteration (ms): 65006.1 | learning rate: 5.659E-04 | approx flops per GPU: 68.0TFLOPS | lm_loss: 2.336959E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 00:46:58,268] [INFO] [logging.py:60:log_dist] [Rank 0] step=23390, skipped=28, lr=[0.0005658447867705219, 0.0005658447867705219], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23390 loss: 2.3416 iter time (s): 64.477 samples/sec: 15.882 %comms: 0.0028042993672870086 %optimizer_step 0.06156154104848979 %forward: 22.58667843702776 %backward: 60.522887882146314 [2025-04-11 00:46:58,268] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 35235.47 | forward: 145631.38 | backward_microstep: 390244.52 | backward: 390231.42 | backward_inner_microstep: 390211.12 | backward_inner: 390203.60 | backward_allreduce_microstep: 9.83 | backward_allreduce: 3.40 | reduce_tied_grads: 0.34 | comms: 18.08 | reduce_grads: 0.23 | step: 396.93 | _step_clipping: 0.13 | _step_step: 394.81 | _step_zero_grad: 0.59 | _step_check_overflow: 0.71 samples/sec: 15.882 | iteration 23390/ 143000 | elapsed time per iteration (ms): 64477.3 | learning rate: 5.658E-04 | approx flops per GPU: 68.5TFLOPS | lm_loss: 2.327386E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 00:57:30,309] [INFO] [logging.py:60:log_dist] [Rank 0] step=23400, skipped=28, lr=[0.0005658142388108745, 0.0005658142388108745], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23400 loss: 2.3340 iter time (s): 63.203 samples/sec: 16.202 %comms: 0.002906247986613275 %optimizer_step 0.058994763167688945 %forward: 22.991332914623584 %backward: 61.72850990971528 [2025-04-11 00:57:30,309] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22955.22 | forward: 145313.10 | backward_microstep: 390156.28 | backward: 390145.32 | backward_inner_microstep: 390122.35 | backward_inner: 390115.27 | backward_allreduce_microstep: 11.23 | backward_allreduce: 4.90 | reduce_tied_grads: 0.40 | comms: 18.37 | reduce_grads: 0.27 | step: 372.87 | _step_clipping: 0.15 | _step_step: 370.90 | _step_zero_grad: 0.59 | _step_check_overflow: 0.57 samples/sec: 16.201 | iteration 23400/ 143000 | elapsed time per iteration (ms): 63204.1 | learning rate: 5.658E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.334185E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 01:08:07,329] [INFO] [logging.py:60:log_dist] [Rank 0] step=23410, skipped=28, lr=[0.0005657836780218399, 0.0005657836780218399], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23410 loss: 2.3563 iter time (s): 63.701 samples/sec: 16.075 %comms: 0.002824390025435492 %optimizer_step 0.05505874441564201 %forward: 22.818979508730923 %backward: 61.24641054547133 [2025-04-11 01:08:07,330] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27906.87 | forward: 145360.27 | backward_microstep: 390160.99 | backward: 390148.67 | backward_inner_microstep: 390130.32 | backward_inner: 390121.74 | backward_allreduce_microstep: 8.87 | backward_allreduce: 3.07 | reduce_tied_grads: 0.30 | comms: 17.99 | reduce_grads: 0.20 | step: 350.73 | _step_clipping: 0.12 | _step_step: 349.05 | _step_zero_grad: 0.48 | _step_check_overflow: 0.51 samples/sec: 16.075 | iteration 23410/ 143000 | elapsed time per iteration (ms): 63702.0 | learning rate: 5.658E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.343656E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 01:18:35,298] [INFO] [logging.py:60:log_dist] [Rank 0] step=23420, skipped=28, lr=[0.0005657531044048932, 0.0005657531044048932], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23420 loss: 2.3396 iter time (s): 62.796 samples/sec: 16.307 %comms: 0.0028516204992972906 %optimizer_step 0.05508333944816121 %forward: 23.156549562765726 %backward: 62.13719198160885 [2025-04-11 01:18:35,299] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18733.55 | forward: 145414.73 | backward_microstep: 390211.84 | backward: 390199.03 | backward_inner_microstep: 390181.45 | backward_inner: 390174.71 | backward_allreduce_microstep: 8.36 | backward_allreduce: 2.90 | reduce_tied_grads: 0.28 | comms: 17.91 | reduce_grads: 0.19 | step: 345.90 | _step_clipping: 0.14 | _step_step: 344.22 | _step_zero_grad: 0.46 | _step_check_overflow: 0.51 samples/sec: 16.307 | iteration 23420/ 143000 | elapsed time per iteration (ms): 62796.9 | learning rate: 5.658E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.339392E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 01:29:17,221] [INFO] [logging.py:60:log_dist] [Rank 0] step=23430, skipped=28, lr=[0.00056572251796151, 0.00056572251796151], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23430 loss: 2.3411 iter time (s): 64.192 samples/sec: 15.952 %comms: 0.0028538880316575465 %optimizer_step 0.05714468983788097 %forward: 22.677162105403788 %backward: 60.78482539650022 [2025-04-11 01:29:17,222] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32517.36 | forward: 145568.67 | backward_microstep: 390201.10 | backward: 390188.43 | backward_inner_microstep: 390169.62 | backward_inner: 390162.67 | backward_allreduce_microstep: 8.97 | backward_allreduce: 3.07 | reduce_tied_grads: 0.34 | comms: 18.32 | reduce_grads: 0.24 | step: 366.82 | _step_clipping: 0.14 | _step_step: 364.76 | _step_zero_grad: 0.61 | _step_check_overflow: 0.64 samples/sec: 15.952 | iteration 23430/ 143000 | elapsed time per iteration (ms): 64192.4 | learning rate: 5.657E-04 | approx flops per GPU: 68.8TFLOPS | lm_loss: 2.343336E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 01:39:57,943] [INFO] [logging.py:60:log_dist] [Rank 0] step=23440, skipped=28, lr=[0.0005656919186931665, 0.0005656919186931665], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23440 loss: 2.3493 iter time (s): 64.072 samples/sec: 15.982 %comms: 0.0028331584861340455 %optimizer_step 0.05741543382496608 %forward: 22.7348701799687 %backward: 60.890876302145216 [2025-04-11 01:39:57,944] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31257.67 | forward: 145665.76 | backward_microstep: 390149.15 | backward: 390137.06 | backward_inner_microstep: 390117.35 | backward_inner: 390110.25 | backward_allreduce_microstep: 9.45 | backward_allreduce: 3.26 | reduce_tied_grads: 0.34 | comms: 18.15 | reduce_grads: 0.23 | step: 367.87 | _step_clipping: 0.14 | _step_step: 365.88 | _step_zero_grad: 0.56 | _step_check_overflow: 0.66 samples/sec: 15.982 | iteration 23440/ 143000 | elapsed time per iteration (ms): 64072.1 | learning rate: 5.657E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.349757E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 01:50:32,287] [INFO] [logging.py:60:log_dist] [Rank 0] step=23450, skipped=28, lr=[0.0005656613066013395, 0.0005656613066013395], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23450 loss: 2.3456 iter time (s): 63.434 samples/sec: 16.143 %comms: 0.0028726503980973836 %optimizer_step 0.05669036306821753 %forward: 22.92455668384633 %backward: 61.49462486274003 [2025-04-11 01:50:32,288] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25168.77 | forward: 145419.33 | backward_microstep: 390095.16 | backward: 390084.18 | backward_inner_microstep: 390064.97 | backward_inner: 390058.12 | backward_allreduce_microstep: 8.38 | backward_allreduce: 2.87 | reduce_tied_grads: 0.33 | comms: 18.22 | reduce_grads: 0.23 | step: 359.61 | _step_clipping: 0.15 | _step_step: 357.69 | _step_zero_grad: 0.52 | _step_check_overflow: 0.64 samples/sec: 16.143 | iteration 23450/ 143000 | elapsed time per iteration (ms): 63434.4 | learning rate: 5.657E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.339956E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 02:01:10,644] [INFO] [logging.py:60:log_dist] [Rank 0] step=23460, skipped=28, lr=[0.0005656306816875068, 0.0005656306816875068], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23460 loss: 2.3260 iter time (s): 63.835 samples/sec: 16.041 %comms: 0.002853329952453314 %optimizer_step 0.05734137715865577 %forward: 22.796325061762637 %backward: 61.121779544578494 [2025-04-11 02:01:10,644] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29007.47 | forward: 145520.29 | backward_microstep: 390182.39 | backward: 390170.75 | backward_inner_microstep: 390149.06 | backward_inner: 390141.64 | backward_allreduce_microstep: 11.47 | backward_allreduce: 5.05 | reduce_tied_grads: 0.35 | comms: 18.21 | reduce_grads: 0.23 | step: 366.04 | _step_clipping: 0.14 | _step_step: 364.01 | _step_zero_grad: 0.59 | _step_check_overflow: 0.59 samples/sec: 16.041 | iteration 23460/ 143000 | elapsed time per iteration (ms): 63835.6 | learning rate: 5.656E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.330529E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 02:11:53,723] [INFO] [logging.py:60:log_dist] [Rank 0] step=23470, skipped=28, lr=[0.0005656000439531462, 0.0005656000439531462], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23470 loss: 2.3210 iter time (s): 64.307 samples/sec: 15.924 %comms: 0.002813608712541918 %optimizer_step 0.057411112973322055 %forward: 22.62708768394702 %backward: 60.673430212252 [2025-04-11 02:11:53,724] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 33743.34 | forward: 145508.92 | backward_microstep: 390188.07 | backward: 390175.05 | backward_inner_microstep: 390156.11 | backward_inner: 390149.14 | backward_allreduce_microstep: 9.21 | backward_allreduce: 3.15 | reduce_tied_grads: 0.35 | comms: 18.09 | reduce_grads: 0.23 | step: 369.20 | _step_clipping: 0.13 | _step_step: 367.15 | _step_zero_grad: 0.57 | _step_check_overflow: 0.70 samples/sec: 15.923 | iteration 23470/ 143000 | elapsed time per iteration (ms): 64308.0 | learning rate: 5.656E-04 | approx flops per GPU: 68.7TFLOPS | lm_loss: 2.331277E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 02:22:33,290] [INFO] [logging.py:60:log_dist] [Rank 0] step=23480, skipped=28, lr=[0.0005655693933997365, 0.0005655693933997365], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23480 loss: 2.3553 iter time (s): 63.956 samples/sec: 16.011 %comms: 0.0028555388101399897 %optimizer_step 0.05598332299426358 %forward: 22.739992466663644 %backward: 61.01455050499658 [2025-04-11 02:22:33,290] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30278.23 | forward: 145435.73 | backward_microstep: 390236.27 | backward: 390224.21 | backward_inner_microstep: 390203.89 | backward_inner: 390197.03 | backward_allreduce_microstep: 9.11 | backward_allreduce: 3.07 | reduce_tied_grads: 0.32 | comms: 18.26 | reduce_grads: 0.21 | step: 358.05 | _step_clipping: 0.13 | _step_step: 355.74 | _step_zero_grad: 0.76 | _step_check_overflow: 0.57 samples/sec: 16.011 | iteration 23480/ 143000 | elapsed time per iteration (ms): 63956.6 | learning rate: 5.656E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.330862E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 02:33:05,262] [INFO] [logging.py:60:log_dist] [Rank 0] step=23490, skipped=28, lr=[0.000565538730028757, 0.000565538730028757], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23490 loss: 2.3067 iter time (s): 63.197 samples/sec: 16.203 %comms: 0.0028536285167214386 %optimizer_step 0.055736110918711466 %forward: 22.98685993517458 %backward: 61.71481902526944 [2025-04-11 02:33:05,263] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23061.28 | forward: 145269.29 | backward_microstep: 390027.85 | backward: 390017.09 | backward_inner_microstep: 389998.53 | backward_inner: 389991.71 | backward_allreduce_microstep: 9.21 | backward_allreduce: 3.04 | reduce_tied_grads: 0.31 | comms: 18.03 | reduce_grads: 0.22 | step: 352.23 | _step_clipping: 0.14 | _step_step: 350.44 | _step_zero_grad: 0.52 | _step_check_overflow: 0.53 samples/sec: 16.203 | iteration 23490/ 143000 | elapsed time per iteration (ms): 63197.2 | learning rate: 5.655E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.333782E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 02:43:29,581] [INFO] [logging.py:60:log_dist] [Rank 0] step=23500, skipped=28, lr=[0.0005655080538416877, 0.0005655080538416877], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23500 loss: 2.3340 iter time (s): 62.431 samples/sec: 16.402 %comms: 0.0028622184485836946 %optimizer_step 0.055038655577855464 %forward: 23.248299443874227 %backward: 62.444691468373115 [2025-04-11 02:43:29,582] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15764.75 | forward: 145142.42 | backward_microstep: 389859.01 | backward: 389851.03 | backward_inner_microstep: 389835.01 | backward_inner: 389829.00 | backward_allreduce_microstep: 7.77 | backward_allreduce: 2.67 | reduce_tied_grads: 0.26 | comms: 17.87 | reduce_grads: 0.20 | step: 343.61 | _step_clipping: 0.11 | _step_step: 342.02 | _step_zero_grad: 0.45 | _step_check_overflow: 0.48 samples/sec: 16.402 | iteration 23500/ 143000 | elapsed time per iteration (ms): 62431.9 | learning rate: 5.655E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.333395E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 02:54:01,875] [INFO] [logging.py:60:log_dist] [Rank 0] step=23510, skipped=28, lr=[0.0005654773648400091, 0.0005654773648400091], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23510 loss: 2.3590 iter time (s): 63.229 samples/sec: 16.195 %comms: 0.00285711601281316 %optimizer_step 0.05570254434308739 %forward: 22.985284862341636 %backward: 61.68081572826185 [2025-04-11 02:54:01,876] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23335.94 | forward: 145333.30 | backward_microstep: 390009.43 | backward: 390000.67 | backward_inner_microstep: 389983.63 | backward_inner: 389977.32 | backward_allreduce_microstep: 8.21 | backward_allreduce: 2.81 | reduce_tied_grads: 0.30 | comms: 18.07 | reduce_grads: 0.20 | step: 352.20 | _step_clipping: 0.10 | _step_step: 350.33 | _step_zero_grad: 0.51 | _step_check_overflow: 0.67 samples/sec: 16.195 | iteration 23510/ 143000 | elapsed time per iteration (ms): 63229.4 | learning rate: 5.655E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.339104E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 03:04:43,398] [INFO] [logging.py:60:log_dist] [Rank 0] step=23520, skipped=28, lr=[0.0005654466630252025, 0.0005654466630252025], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23520 loss: 2.3402 iter time (s): 64.152 samples/sec: 15.962 %comms: 0.002814787352342726 %optimizer_step 0.056133708534400875 %forward: 22.704536259273752 %backward: 60.84124548900158 [2025-04-11 03:04:43,399] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31886.37 | forward: 145653.52 | backward_microstep: 390322.20 | backward: 390307.08 | backward_inner_microstep: 390286.85 | backward_inner: 390279.45 | backward_allreduce_microstep: 9.78 | backward_allreduce: 3.36 | reduce_tied_grads: 0.51 | comms: 18.06 | reduce_grads: 0.22 | step: 360.11 | _step_clipping: 0.13 | _step_step: 358.21 | _step_zero_grad: 0.56 | _step_check_overflow: 0.58 samples/sec: 15.962 | iteration 23520/ 143000 | elapsed time per iteration (ms): 64152.3 | learning rate: 5.654E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.327699E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 03:15:18,090] [INFO] [logging.py:60:log_dist] [Rank 0] step=23530, skipped=28, lr=[0.0005654159483987496, 0.0005654159483987496], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23530 loss: 2.3311 iter time (s): 63.469 samples/sec: 16.134 %comms: 0.002924457830945761 %optimizer_step 0.05735627715345347 %forward: 22.915428025050808 %backward: 61.482040474756324 [2025-04-11 03:15:18,091] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25359.48 | forward: 145441.02 | backward_microstep: 390232.03 | backward: 390217.91 | backward_inner_microstep: 390200.22 | backward_inner: 390193.12 | backward_allreduce_microstep: 8.33 | backward_allreduce: 2.88 | reduce_tied_grads: 0.34 | comms: 18.56 | reduce_grads: 0.20 | step: 364.03 | _step_clipping: 0.12 | _step_step: 362.08 | _step_zero_grad: 0.54 | _step_check_overflow: 0.65 samples/sec: 16.134 | iteration 23530/ 143000 | elapsed time per iteration (ms): 63469.2 | learning rate: 5.654E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.336285E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 03:25:51,233] [INFO] [logging.py:60:log_dist] [Rank 0] step=23540, skipped=28, lr=[0.0005653852209621329, 0.0005653852209621329], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23540 loss: 2.3339 iter time (s): 63.314 samples/sec: 16.173 %comms: 0.0029262286498024552 %optimizer_step 0.05728999732451241 %forward: 22.986738590686727 %backward: 61.66409403339289 [2025-04-11 03:25:51,234] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23494.47 | forward: 145537.51 | backward_microstep: 390433.14 | backward: 390418.08 | backward_inner_microstep: 390399.63 | backward_inner: 390390.66 | backward_allreduce_microstep: 8.65 | backward_allreduce: 3.00 | reduce_tied_grads: 0.32 | comms: 18.53 | reduce_grads: 0.19 | step: 362.72 | _step_clipping: 0.13 | _step_step: 360.70 | _step_zero_grad: 0.61 | _step_check_overflow: 0.63 samples/sec: 16.173 | iteration 23540/ 143000 | elapsed time per iteration (ms): 63314.3 | learning rate: 5.654E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.327888E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 03:36:29,810] [INFO] [logging.py:60:log_dist] [Rank 0] step=23550, skipped=28, lr=[0.0005653544807168354, 0.0005653544807168354], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23550 loss: 2.3249 iter time (s): 63.857 samples/sec: 16.036 %comms: 0.0029210794451476705 %optimizer_step 0.05930604831443523 %forward: 22.804762835996577 %backward: 61.127449306371936 [2025-04-11 03:36:29,811] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28869.46 | forward: 145624.51 | backward_microstep: 390357.34 | backward: 390341.92 | backward_inner_microstep: 390320.98 | backward_inner: 390313.58 | backward_allreduce_microstep: 10.81 | backward_allreduce: 3.19 | reduce_tied_grads: 0.43 | comms: 18.65 | reduce_grads: 0.26 | step: 378.71 | _step_clipping: 0.14 | _step_step: 376.60 | _step_zero_grad: 0.57 | _step_check_overflow: 0.69 samples/sec: 16.036 | iteration 23550/ 143000 | elapsed time per iteration (ms): 63857.7 | learning rate: 5.654E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.325476E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 03:47:13,207] [INFO] [logging.py:60:log_dist] [Rank 0] step=23560, skipped=28, lr=[0.0005653237276643408, 0.0005653237276643408], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23560 loss: 2.3333 iter time (s): 64.339 samples/sec: 15.916 %comms: 0.0028581404733824057 %optimizer_step 0.057571284651276755 %forward: 22.61877878866939 %backward: 60.648433746821006 [2025-04-11 03:47:13,207] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 33994.14 | forward: 145526.93 | backward_microstep: 390219.66 | backward: 390205.89 | backward_inner_microstep: 390185.90 | backward_inner: 390178.44 | backward_allreduce_microstep: 9.71 | backward_allreduce: 3.36 | reduce_tied_grads: 0.36 | comms: 18.39 | reduce_grads: 0.24 | step: 370.41 | _step_clipping: 0.14 | _step_step: 368.37 | _step_zero_grad: 0.65 | _step_check_overflow: 0.55 samples/sec: 15.916 | iteration 23560/ 143000 | elapsed time per iteration (ms): 64339.6 | learning rate: 5.653E-04 | approx flops per GPU: 68.7TFLOPS | lm_loss: 2.336096E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 03:57:47,644] [INFO] [logging.py:60:log_dist] [Rank 0] step=23570, skipped=28, lr=[0.0005652929618061333, 0.0005652929618061333], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23570 loss: 2.3319 iter time (s): 63.443 samples/sec: 16.140 %comms: 0.0028393451715055425 %optimizer_step 0.05636747478264645 %forward: 22.93562672553494 %backward: 61.52286635094987 [2025-04-11 03:57:47,645] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24906.08 | forward: 145510.97 | backward_microstep: 390340.76 | backward: 390320.78 | backward_inner_microstep: 390302.32 | backward_inner: 390295.10 | backward_allreduce_microstep: 8.72 | backward_allreduce: 3.00 | reduce_tied_grads: 0.33 | comms: 18.01 | reduce_grads: 0.20 | step: 357.61 | _step_clipping: 0.12 | _step_step: 355.72 | _step_zero_grad: 0.54 | _step_check_overflow: 0.63 samples/sec: 16.140 | iteration 23570/ 143000 | elapsed time per iteration (ms): 63443.8 | learning rate: 5.653E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.332953E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 04:08:23,252] [INFO] [logging.py:60:log_dist] [Rank 0] step=23580, skipped=28, lr=[0.0005652621831436979, 0.0005652621831436979], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23580 loss: 2.3389 iter time (s): 63.560 samples/sec: 16.111 %comms: 0.0029240333370739247 %optimizer_step 0.05852039056259868 %forward: 22.906046038270212 %backward: 61.400560511293335 [2025-04-11 04:08:23,253] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26069.73 | forward: 145591.21 | backward_microstep: 390277.66 | backward: 390263.00 | backward_inner_microstep: 390244.60 | backward_inner: 390237.56 | backward_allreduce_microstep: 8.76 | backward_allreduce: 3.03 | reduce_tied_grads: 0.36 | comms: 18.59 | reduce_grads: 0.22 | step: 371.96 | _step_clipping: 0.14 | _step_step: 369.92 | _step_zero_grad: 0.56 | _step_check_overflow: 0.65 samples/sec: 16.111 | iteration 23580/ 143000 | elapsed time per iteration (ms): 63560.8 | learning rate: 5.653E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.326702E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 04:18:51,095] [INFO] [logging.py:60:log_dist] [Rank 0] step=23590, skipped=28, lr=[0.00056523139167852, 0.00056523139167852], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23590 loss: 2.3380 iter time (s): 62.784 samples/sec: 16.310 %comms: 0.0029843117189513107 %optimizer_step 0.05685009964007603 %forward: 23.187612943103964 %backward: 62.189967822783984 [2025-04-11 04:18:51,095] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18111.76 | forward: 145580.32 | backward_microstep: 390468.07 | backward: 390451.37 | backward_inner_microstep: 390432.81 | backward_inner: 390425.73 | backward_allreduce_microstep: 9.52 | backward_allreduce: 4.42 | reduce_tied_grads: 0.48 | comms: 18.74 | reduce_grads: 0.19 | step: 356.93 | _step_clipping: 0.26 | _step_step: 354.97 | _step_zero_grad: 0.51 | _step_check_overflow: 0.56 samples/sec: 16.310 | iteration 23590/ 143000 | elapsed time per iteration (ms): 62784.2 | learning rate: 5.652E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.336378E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 04:29:21,203] [INFO] [logging.py:60:log_dist] [Rank 0] step=23600, skipped=28, lr=[0.0005652005874120858, 0.0005652005874120858], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23600 loss: 2.3320 iter time (s): 63.010 samples/sec: 16.251 %comms: 0.002945806319769486 %optimizer_step 0.0594794594010379 %forward: 23.102996857654485 %backward: 61.96600640685953 [2025-04-11 04:29:21,203] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20357.70 | forward: 145572.58 | backward_microstep: 390465.66 | backward: 390449.41 | backward_inner_microstep: 390431.69 | backward_inner: 390424.73 | backward_allreduce_microstep: 8.28 | backward_allreduce: 2.84 | reduce_tied_grads: 0.33 | comms: 18.56 | reduce_grads: 0.21 | step: 374.78 | _step_clipping: 0.12 | _step_step: 372.92 | _step_zero_grad: 0.53 | _step_check_overflow: 0.57 samples/sec: 16.251 | iteration 23600/ 143000 | elapsed time per iteration (ms): 63010.8 | learning rate: 5.652E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.334751E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 04:39:49,872] [INFO] [logging.py:60:log_dist] [Rank 0] step=23610, skipped=28, lr=[0.0005651697703458822, 0.0005651697703458822], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23610 loss: 2.3399 iter time (s): 62.866 samples/sec: 16.289 %comms: 0.0028594810475567113 %optimizer_step 0.055914859529211866 %forward: 23.116236876825035 %backward: 62.059825858518195 [2025-04-11 04:39:49,873] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19524.35 | forward: 145323.42 | backward_microstep: 390160.75 | backward: 390147.66 | backward_inner_microstep: 390130.80 | backward_inner: 390124.36 | backward_allreduce_microstep: 7.99 | backward_allreduce: 2.73 | reduce_tied_grads: 0.29 | comms: 17.98 | reduce_grads: 0.39 | step: 351.52 | _step_clipping: 0.10 | _step_step: 349.78 | _step_zero_grad: 0.52 | _step_check_overflow: 0.53 samples/sec: 16.288 | iteration 23610/ 143000 | elapsed time per iteration (ms): 62866.9 | learning rate: 5.652E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.332870E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 04:50:20,844] [INFO] [logging.py:60:log_dist] [Rank 0] step=23620, skipped=28, lr=[0.0005651389404813963, 0.0005651389404813963], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23620 loss: 2.3391 iter time (s): 63.097 samples/sec: 16.229 %comms: 0.002843492363188403 %optimizer_step 0.05502591509145063 %forward: 23.0415667492683 %backward: 61.845756289369945 [2025-04-11 04:50:20,844] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21696.73 | forward: 145384.49 | backward_microstep: 390239.45 | backward: 390225.80 | backward_inner_microstep: 390207.71 | backward_inner: 390200.86 | backward_allreduce_microstep: 8.77 | backward_allreduce: 2.96 | reduce_tied_grads: 0.30 | comms: 17.94 | reduce_grads: 0.20 | step: 347.19 | _step_clipping: 0.11 | _step_step: 345.61 | _step_zero_grad: 0.50 | _step_check_overflow: 0.40 samples/sec: 16.229 | iteration 23620/ 143000 | elapsed time per iteration (ms): 63097.1 | learning rate: 5.651E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.350475E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 05:00:50,450] [INFO] [logging.py:60:log_dist] [Rank 0] step=23630, skipped=28, lr=[0.0005651080978201163, 0.0005651080978201163], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23630 loss: 2.3223 iter time (s): 62.960 samples/sec: 16.264 %comms: 0.00287260540120432 %optimizer_step 0.05562258200509851 %forward: 23.091071953470568 %backward: 62.00361957671864 [2025-04-11 05:00:50,451] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20124.84 | forward: 145381.65 | backward_microstep: 390387.88 | backward: 390375.50 | backward_inner_microstep: 390358.03 | backward_inner: 390351.53 | backward_allreduce_microstep: 8.25 | backward_allreduce: 2.85 | reduce_tied_grads: 0.30 | comms: 18.09 | reduce_grads: 0.22 | step: 350.20 | _step_clipping: 0.11 | _step_step: 348.50 | _step_zero_grad: 0.53 | _step_check_overflow: 0.44 samples/sec: 16.264 | iteration 23630/ 143000 | elapsed time per iteration (ms): 62960.7 | learning rate: 5.651E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.336903E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 05:11:26,319] [INFO] [logging.py:60:log_dist] [Rank 0] step=23640, skipped=28, lr=[0.0005650772423635306, 0.0005650772423635306], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23640 loss: 2.3268 iter time (s): 63.586 samples/sec: 16.104 %comms: 0.0028785907731011242 %optimizer_step 0.05768137678957681 %forward: 22.860229374468258 %backward: 61.36042003461277 [2025-04-11 05:11:26,319] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26654.50 | forward: 145359.56 | backward_microstep: 390177.68 | backward: 390167.73 | backward_inner_microstep: 390149.21 | backward_inner: 390142.49 | backward_allreduce_microstep: 9.02 | backward_allreduce: 3.09 | reduce_tied_grads: 0.39 | comms: 18.30 | reduce_grads: 0.25 | step: 366.77 | _step_clipping: 0.15 | _step_step: 364.79 | _step_zero_grad: 0.57 | _step_check_overflow: 0.60 samples/sec: 16.104 | iteration 23640/ 143000 | elapsed time per iteration (ms): 63586.8 | learning rate: 5.651E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.333717E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 05:22:06,898] [INFO] [logging.py:60:log_dist] [Rank 0] step=23650, skipped=28, lr=[0.0005650463741131286, 0.0005650463741131286], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23650 loss: 2.3352 iter time (s): 64.057 samples/sec: 15.986 %comms: 0.002881431376689994 %optimizer_step 0.058742387463997014 %forward: 22.764046610882854 %backward: 60.91666677848061 [2025-04-11 05:22:06,898] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30845.21 | forward: 145820.18 | backward_microstep: 390226.54 | backward: 390215.29 | backward_inner_microstep: 390196.19 | backward_inner: 390189.21 | backward_allreduce_microstep: 9.31 | backward_allreduce: 3.20 | reduce_tied_grads: 0.40 | comms: 18.46 | reduce_grads: 0.26 | step: 376.29 | _step_clipping: 0.12 | _step_step: 374.20 | _step_zero_grad: 0.62 | _step_check_overflow: 0.63 samples/sec: 15.986 | iteration 23650/ 143000 | elapsed time per iteration (ms): 64057.9 | learning rate: 5.650E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.333718E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 05:32:38,060] [INFO] [logging.py:60:log_dist] [Rank 0] step=23660, skipped=28, lr=[0.0005650154930704001, 0.0005650154930704001], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23660 loss: 2.3353 iter time (s): 63.116 samples/sec: 16.224 %comms: 0.00284947294507769 %optimizer_step 0.056995314008821005 %forward: 23.036970236712833 %backward: 61.83018719330957 [2025-04-11 05:32:38,060] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21848.89 | forward: 145399.29 | backward_microstep: 390254.81 | backward: 390245.14 | backward_inner_microstep: 390227.52 | backward_inner: 390221.00 | backward_allreduce_microstep: 8.55 | backward_allreduce: 2.95 | reduce_tied_grads: 0.32 | comms: 17.98 | reduce_grads: 0.23 | step: 359.73 | _step_clipping: 0.12 | _step_step: 358.06 | _step_zero_grad: 0.51 | _step_check_overflow: 0.46 samples/sec: 16.224 | iteration 23660/ 143000 | elapsed time per iteration (ms): 63116.2 | learning rate: 5.650E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.335632E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 05:43:09,754] [INFO] [logging.py:60:log_dist] [Rank 0] step=23670, skipped=28, lr=[0.0005649845992368355, 0.0005649845992368355], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23670 loss: 2.3250 iter time (s): 63.169 samples/sec: 16.211 %comms: 0.0029230854750698742 %optimizer_step 0.0583738059535333 %forward: 23.043596634540393 %backward: 61.81097783438631 [2025-04-11 05:43:09,755] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21944.35 | forward: 145563.82 | backward_microstep: 390465.54 | backward: 390453.03 | backward_inner_microstep: 390433.75 | backward_inner: 390425.07 | backward_allreduce_microstep: 9.40 | backward_allreduce: 3.24 | reduce_tied_grads: 0.40 | comms: 18.46 | reduce_grads: 0.25 | step: 368.74 | _step_clipping: 0.14 | _step_step: 366.62 | _step_zero_grad: 0.58 | _step_check_overflow: 0.72 samples/sec: 16.210 | iteration 23670/ 143000 | elapsed time per iteration (ms): 63169.5 | learning rate: 5.650E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.334363E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 05:53:44,640] [INFO] [logging.py:60:log_dist] [Rank 0] step=23680, skipped=28, lr=[0.0005649536926139259, 0.0005649536926139259], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23680 loss: 2.3079 iter time (s): 63.488 samples/sec: 16.129 %comms: 0.0028273155843725603 %optimizer_step 0.054570818430678306 %forward: 22.937730925252332 %backward: 61.47128397259143 [2025-04-11 05:53:44,641] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25308.66 | forward: 145627.04 | backward_microstep: 390279.23 | backward: 390268.82 | backward_inner_microstep: 390251.50 | backward_inner: 390244.95 | backward_allreduce_microstep: 8.30 | backward_allreduce: 2.91 | reduce_tied_grads: 0.32 | comms: 17.95 | reduce_grads: 0.19 | step: 346.46 | _step_clipping: 0.11 | _step_step: 344.74 | _step_zero_grad: 0.47 | _step_check_overflow: 0.56 samples/sec: 16.129 | iteration 23680/ 143000 | elapsed time per iteration (ms): 63488.6 | learning rate: 5.650E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.329779E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 06:04:10,703] [INFO] [logging.py:60:log_dist] [Rank 0] step=23690, skipped=28, lr=[0.0005649227732031631, 0.0005649227732031631], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23690 loss: 2.3316 iter time (s): 62.606 samples/sec: 16.356 %comms: 0.002881630178696686 %optimizer_step 0.05542137855548676 %forward: 23.1899367237762 %backward: 62.303718705586896 [2025-04-11 06:04:10,704] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17179.15 | forward: 145182.30 | backward_microstep: 390065.42 | backward: 390057.00 | backward_inner_microstep: 390039.51 | backward_inner: 390033.56 | backward_allreduce_microstep: 7.50 | backward_allreduce: 2.58 | reduce_tied_grads: 0.29 | comms: 18.04 | reduce_grads: 0.19 | step: 346.97 | _step_clipping: 0.12 | _step_step: 345.24 | _step_zero_grad: 0.46 | _step_check_overflow: 0.59 samples/sec: 16.356 | iteration 23690/ 143000 | elapsed time per iteration (ms): 62606.3 | learning rate: 5.649E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.332766E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 06:14:36,649] [INFO] [logging.py:60:log_dist] [Rank 0] step=23700, skipped=28, lr=[0.0005648918410060392, 0.0005648918410060392], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23700 loss: 2.3249 iter time (s): 62.594 samples/sec: 16.359 %comms: 0.002869027107110733 %optimizer_step 0.05549620880424519 %forward: 23.20185569601152 %backward: 62.33444747923985 [2025-04-11 06:14:36,650] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16888.80 | forward: 145229.81 | backward_microstep: 390186.32 | backward: 390176.55 | backward_inner_microstep: 390160.30 | backward_inner: 390154.13 | backward_allreduce_microstep: 7.82 | backward_allreduce: 2.64 | reduce_tied_grads: 0.29 | comms: 17.96 | reduce_grads: 0.19 | step: 347.37 | _step_clipping: 0.11 | _step_step: 345.75 | _step_zero_grad: 0.46 | _step_check_overflow: 0.50 samples/sec: 16.359 | iteration 23700/ 143000 | elapsed time per iteration (ms): 62594.6 | learning rate: 5.649E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.326082E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 06:25:13,623] [INFO] [logging.py:60:log_dist] [Rank 0] step=23710, skipped=28, lr=[0.0005648608960240474, 0.0005648608960240474], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23710 loss: 2.3216 iter time (s): 63.697 samples/sec: 16.076 %comms: 0.0028692143719217765 %optimizer_step 0.05584373398308941 %forward: 22.83966756876169 %backward: 61.25386935576387 [2025-04-11 06:25:13,624] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27665.87 | forward: 145481.36 | backward_microstep: 390177.02 | backward: 390167.52 | backward_inner_microstep: 390150.22 | backward_inner: 390143.67 | backward_allreduce_microstep: 8.29 | backward_allreduce: 2.85 | reduce_tied_grads: 0.30 | comms: 18.28 | reduce_grads: 0.21 | step: 355.71 | _step_clipping: 0.12 | _step_step: 353.89 | _step_zero_grad: 0.59 | _step_check_overflow: 0.50 samples/sec: 16.076 | iteration 23710/ 143000 | elapsed time per iteration (ms): 63697.4 | learning rate: 5.649E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.331992E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 06:35:45,105] [INFO] [logging.py:60:log_dist] [Rank 0] step=23720, skipped=28, lr=[0.0005648299382586809, 0.0005648299382586809], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23720 loss: 2.3361 iter time (s): 63.148 samples/sec: 16.216 %comms: 0.0028582609816892764 %optimizer_step 0.0560896339468157 %forward: 23.030839807631367 %backward: 61.806203935502936 [2025-04-11 06:35:45,106] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22084.41 | forward: 145434.29 | backward_microstep: 390302.56 | backward: 390291.52 | backward_inner_microstep: 390273.74 | backward_inner: 390267.16 | backward_allreduce_microstep: 8.59 | backward_allreduce: 2.96 | reduce_tied_grads: 0.34 | comms: 18.05 | reduce_grads: 0.22 | step: 354.19 | _step_clipping: 0.13 | _step_step: 352.40 | _step_zero_grad: 0.53 | _step_check_overflow: 0.52 samples/sec: 16.216 | iteration 23720/ 143000 | elapsed time per iteration (ms): 63148.2 | learning rate: 5.648E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.333756E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 06:46:25,980] [INFO] [logging.py:60:log_dist] [Rank 0] step=23730, skipped=28, lr=[0.0005647989677114341, 0.0005647989677114341], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23730 loss: 2.3298 iter time (s): 64.087 samples/sec: 15.978 %comms: 0.0028199410196166103 %optimizer_step 0.05631336650681691 %forward: 22.705432202906948 %backward: 60.89467687628074 [2025-04-11 06:46:25,981] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31426.37 | forward: 145512.08 | backward_microstep: 390266.59 | backward: 390255.12 | backward_inner_microstep: 390235.86 | backward_inner: 390228.83 | backward_allreduce_microstep: 9.24 | backward_allreduce: 3.18 | reduce_tied_grads: 0.34 | comms: 18.07 | reduce_grads: 0.23 | step: 360.89 | _step_clipping: 0.13 | _step_step: 359.08 | _step_zero_grad: 0.52 | _step_check_overflow: 0.56 samples/sec: 15.978 | iteration 23730/ 143000 | elapsed time per iteration (ms): 64087.5 | learning rate: 5.648E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.336394E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 06:56:59,461] [INFO] [logging.py:60:log_dist] [Rank 0] step=23740, skipped=28, lr=[0.0005647679843838018, 0.0005647679843838018], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23740 loss: 2.3056 iter time (s): 63.347 samples/sec: 16.165 %comms: 0.002886127005403148 %optimizer_step 0.057846326969999876 %forward: 22.967923587322332 %backward: 61.612884925907764 [2025-04-11 06:56:59,461] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23935.31 | forward: 145496.03 | backward_microstep: 390316.34 | backward: 390302.16 | backward_inner_microstep: 390282.48 | backward_inner: 390275.08 | backward_allreduce_microstep: 9.53 | backward_allreduce: 3.28 | reduce_tied_grads: 0.38 | comms: 18.28 | reduce_grads: 0.26 | step: 366.44 | _step_clipping: 0.15 | _step_step: 364.43 | _step_zero_grad: 0.57 | _step_check_overflow: 0.61 samples/sec: 16.165 | iteration 23740/ 143000 | elapsed time per iteration (ms): 63348.1 | learning rate: 5.648E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.330511E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 07:07:54,640] [INFO] [logging.py:60:log_dist] [Rank 0] step=23750, skipped=28, lr=[0.0005647369882772793, 0.0005647369882772793], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23750 loss: 2.3103 iter time (s): 65.517 samples/sec: 15.629 %comms: 0.002848735201563352 %optimizer_step 0.05722463202795006 %forward: 22.240630159920972 %backward: 59.57197553268377 [2025-04-11 07:07:54,641] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 45445.37 | forward: 145714.43 | backward_microstep: 390314.45 | backward: 390299.03 | backward_inner_microstep: 390278.52 | backward_inner: 390271.06 | backward_allreduce_microstep: 9.81 | backward_allreduce: 3.36 | reduce_tied_grads: 0.41 | comms: 18.66 | reduce_grads: 0.26 | step: 374.92 | _step_clipping: 0.14 | _step_step: 372.79 | _step_zero_grad: 0.67 | _step_check_overflow: 0.59 samples/sec: 15.629 | iteration 23750/ 143000 | elapsed time per iteration (ms): 65517.9 | learning rate: 5.647E-04 | approx flops per GPU: 67.4TFLOPS | lm_loss: 2.323793E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 07:18:37,231] [INFO] [logging.py:60:log_dist] [Rank 0] step=23760, skipped=28, lr=[0.0005647059793933625, 0.0005647059793933625], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23760 loss: 2.3452 iter time (s): 64.258 samples/sec: 15.936 %comms: 0.002841241271893497 %optimizer_step 0.057883412934486324 %forward: 22.697982397050502 %backward: 60.7150177616983 [2025-04-11 07:18:37,232] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32880.16 | forward: 145853.74 | backward_microstep: 390159.59 | backward: 390145.37 | backward_inner_microstep: 390126.09 | backward_inner: 390115.24 | backward_allreduce_microstep: 9.41 | backward_allreduce: 3.17 | reduce_tied_grads: 0.38 | comms: 18.26 | reduce_grads: 0.26 | step: 371.95 | _step_clipping: 0.13 | _step_step: 370.01 | _step_zero_grad: 0.55 | _step_check_overflow: 0.59 samples/sec: 15.935 | iteration 23760/ 143000 | elapsed time per iteration (ms): 64259.1 | learning rate: 5.647E-04 | approx flops per GPU: 68.7TFLOPS | lm_loss: 2.338179E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 07:29:18,210] [INFO] [logging.py:60:log_dist] [Rank 0] step=23770, skipped=28, lr=[0.0005646749577335483, 0.0005646749577335483], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23770 loss: 2.3231 iter time (s): 64.097 samples/sec: 15.976 %comms: 0.0028493157922834193 %optimizer_step 0.057621592527953555 %forward: 22.748620632953294 %backward: 60.87286066150873 [2025-04-11 07:29:18,211] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31284.42 | forward: 145812.48 | backward_microstep: 390191.72 | backward: 390178.50 | backward_inner_microstep: 390158.68 | backward_inner: 390151.27 | backward_allreduce_microstep: 9.55 | backward_allreduce: 3.27 | reduce_tied_grads: 0.37 | comms: 18.26 | reduce_grads: 0.25 | step: 369.34 | _step_clipping: 0.14 | _step_step: 367.40 | _step_zero_grad: 0.62 | _step_check_overflow: 0.50 samples/sec: 15.976 | iteration 23770/ 143000 | elapsed time per iteration (ms): 64097.9 | learning rate: 5.647E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.327924E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 07:39:47,147] [INFO] [logging.py:60:log_dist] [Rank 0] step=23780, skipped=28, lr=[0.0005646439232993338, 0.0005646439232993338], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23780 loss: 2.3319 iter time (s): 62.893 samples/sec: 16.282 %comms: 0.0028926123395046238 %optimizer_step 0.055437168526215136 %forward: 23.116042303543196 %backward: 62.038664752493865 [2025-04-11 07:39:47,147] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19703.05 | forward: 145383.91 | backward_microstep: 390193.97 | backward: 390180.27 | backward_inner_microstep: 390163.07 | backward_inner: 390156.38 | backward_allreduce_microstep: 8.14 | backward_allreduce: 2.79 | reduce_tied_grads: 0.29 | comms: 18.19 | reduce_grads: 0.21 | step: 348.66 | _step_clipping: 0.11 | _step_step: 346.95 | _step_zero_grad: 0.50 | _step_check_overflow: 0.54 samples/sec: 16.281 | iteration 23780/ 143000 | elapsed time per iteration (ms): 62893.6 | learning rate: 5.646E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.334217E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 07:50:13,446] [INFO] [logging.py:60:log_dist] [Rank 0] step=23790, skipped=28, lr=[0.0005646128760922168, 0.0005646128760922168], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23790 loss: 2.3224 iter time (s): 62.629 samples/sec: 16.350 %comms: 0.0029213480207550243 %optimizer_step 0.05848017965582261 %forward: 23.180005712464983 %backward: 62.28367287047124 [2025-04-11 07:50:13,447] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17360.32 | forward: 145175.10 | backward_microstep: 390090.01 | backward: 390079.21 | backward_inner_microstep: 390062.72 | backward_inner: 390056.14 | backward_allreduce_microstep: 7.86 | backward_allreduce: 2.71 | reduce_tied_grads: 0.31 | comms: 18.30 | reduce_grads: 0.19 | step: 366.26 | _step_clipping: 0.11 | _step_step: 364.44 | _step_zero_grad: 0.48 | _step_check_overflow: 0.61 samples/sec: 16.350 | iteration 23790/ 143000 | elapsed time per iteration (ms): 62630.0 | learning rate: 5.646E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.338783E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 08:00:44,242] [INFO] [logging.py:60:log_dist] [Rank 0] step=23800, skipped=28, lr=[0.0005645818161136959, 0.0005645818161136959], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23800 loss: 2.3345 iter time (s): 63.079 samples/sec: 16.234 %comms: 0.002853814758639093 %optimizer_step 0.05921918863195016 %forward: 23.026859987791582 %backward: 61.82612373261185 [2025-04-11 08:00:44,242] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21881.57 | forward: 145250.95 | backward_microstep: 390002.76 | backward: 389992.54 | backward_inner_microstep: 389975.39 | backward_inner: 389968.42 | backward_allreduce_microstep: 8.22 | backward_allreduce: 2.85 | reduce_tied_grads: 0.30 | comms: 18.00 | reduce_grads: 0.20 | step: 373.55 | _step_clipping: 0.12 | _step_step: 371.77 | _step_zero_grad: 0.50 | _step_check_overflow: 0.56 samples/sec: 16.233 | iteration 23800/ 143000 | elapsed time per iteration (ms): 63079.5 | learning rate: 5.646E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.331066E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 08:11:16,879] [INFO] [logging.py:60:log_dist] [Rank 0] step=23810, skipped=28, lr=[0.0005645507433652701, 0.0005645507433652701], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23810 loss: 2.3529 iter time (s): 63.263 samples/sec: 16.186 %comms: 0.0028485204437757126 %optimizer_step 0.05565943522821876 %forward: 22.975100568526642 %backward: 61.64057129578397 [2025-04-11 08:11:16,879] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23681.37 | forward: 145347.66 | backward_microstep: 389965.69 | backward: 389957.50 | backward_inner_microstep: 389941.84 | backward_inner: 389935.69 | backward_allreduce_microstep: 7.50 | backward_allreduce: 2.57 | reduce_tied_grads: 0.28 | comms: 18.02 | reduce_grads: 0.21 | step: 352.12 | _step_clipping: 0.12 | _step_step: 350.44 | _step_zero_grad: 0.48 | _step_check_overflow: 0.53 samples/sec: 16.186 | iteration 23810/ 143000 | elapsed time per iteration (ms): 63263.7 | learning rate: 5.646E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.342350E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 08:19:43,301] [INFO] [stage1.py:697:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 262144.0, reducing to 131072.0 [2025-04-11 08:21:51,478] [INFO] [logging.py:60:log_dist] [Rank 0] step=23820, skipped=29, lr=[0.0005645227669746576, 0.0005645227669746576], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23820 loss: 2.3641 iter time (s): 63.459 samples/sec: 16.136 %comms: 0.002626840011061792 %optimizer_step 0.05340414809296234 %forward: 22.934442292577728 %backward: 61.48369395562795 [2025-04-11 08:21:51,478] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25176.13 | forward: 145540.43 | backward_microstep: 390183.68 | backward: 390171.39 | backward_inner_microstep: 390154.01 | backward_inner: 390147.15 | backward_allreduce_microstep: 8.28 | backward_allreduce: 2.86 | reduce_tied_grads: 0.35 | comms: 16.67 | reduce_grads: 0.22 | step: 338.90 | _step_clipping: 0.14 | _step_step: 336.93 | _step_zero_grad: 0.54 | _step_check_overflow: 0.63 samples/sec: 16.136 | iteration 23820/ 143000 | elapsed time per iteration (ms): 63459.9 | learning rate: 5.645E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.338622E+00 | loss scale: 131072.0 | number of skipped iterations: 1 | number of nan iterations: 0 | time (ms) [2025-04-11 08:32:30,169] [INFO] [logging.py:60:log_dist] [Rank 0] step=23830, skipped=29, lr=[0.0005644916699675448, 0.0005644916699675448], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23830 loss: 2.3453 iter time (s): 63.869 samples/sec: 16.033 %comms: 0.0028724377954161616 %optimizer_step 0.05603575601966995 %forward: 22.851000341356762 %backward: 61.10763636027534 [2025-04-11 08:32:30,170] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28759.36 | forward: 145945.94 | backward_microstep: 390300.23 | backward: 390285.38 | backward_inner_microstep: 390267.31 | backward_inner: 390260.27 | backward_allreduce_microstep: 8.62 | backward_allreduce: 2.97 | reduce_tied_grads: 0.30 | comms: 18.35 | reduce_grads: 0.20 | step: 357.89 | _step_clipping: 0.13 | _step_step: 355.95 | _step_zero_grad: 0.53 | _step_check_overflow: 0.66 samples/sec: 16.033 | iteration 23830/ 143000 | elapsed time per iteration (ms): 63869.1 | learning rate: 5.645E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.351718E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 08:43:21,253] [INFO] [logging.py:60:log_dist] [Rank 0] step=23840, skipped=29, lr=[0.0005644605601948779, 0.0005644605601948779], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23840 loss: 2.3226 iter time (s): 65.108 samples/sec: 15.728 %comms: 0.002836001154412056 %optimizer_step 0.05803223114762703 %forward: 22.37681865824604 %backward: 59.937286341912554 [2025-04-11 08:43:21,254] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 41442.31 | forward: 145690.43 | backward_microstep: 390255.01 | backward: 390238.18 | backward_inner_microstep: 390215.69 | backward_inner: 390207.93 | backward_allreduce_microstep: 10.05 | backward_allreduce: 3.46 | reduce_tied_grads: 0.41 | comms: 18.46 | reduce_grads: 0.30 | step: 377.83 | _step_clipping: 0.16 | _step_step: 375.64 | _step_zero_grad: 0.62 | _step_check_overflow: 0.69 samples/sec: 15.728 | iteration 23840/ 143000 | elapsed time per iteration (ms): 65108.4 | learning rate: 5.645E-04 | approx flops per GPU: 67.8TFLOPS | lm_loss: 2.339468E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 08:54:00,905] [INFO] [logging.py:60:log_dist] [Rank 0] step=23850, skipped=29, lr=[0.0005644294376581586, 0.0005644294376581586], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23850 loss: 2.3351 iter time (s): 63.965 samples/sec: 16.009 %comms: 0.0028591439517332935 %optimizer_step 0.06326775252700045 %forward: 22.756837603418315 %backward: 61.00528609113527 [2025-04-11 08:54:00,906] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30196.56 | forward: 145562.99 | backward_microstep: 390230.57 | backward: 390217.32 | backward_inner_microstep: 390195.58 | backward_inner: 390188.12 | backward_allreduce_microstep: 9.65 | backward_allreduce: 3.36 | reduce_tied_grads: 0.38 | comms: 18.29 | reduce_grads: 0.24 | step: 404.69 | _step_clipping: 0.14 | _step_step: 402.36 | _step_zero_grad: 0.71 | _step_check_overflow: 0.74 samples/sec: 16.009 | iteration 23850/ 143000 | elapsed time per iteration (ms): 63965.2 | learning rate: 5.644E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.332750E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 09:04:38,192] [INFO] [logging.py:60:log_dist] [Rank 0] step=23860, skipped=29, lr=[0.0005643983023588888, 0.0005643983023588888], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23860 loss: 2.3263 iter time (s): 63.728 samples/sec: 16.068 %comms: 0.002888315490338488 %optimizer_step 0.058828435330009914 %forward: 22.86257770849748 %backward: 61.22643795039966 [2025-04-11 09:04:38,192] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27749.80 | forward: 145698.42 | backward_microstep: 390196.63 | backward: 390183.26 | backward_inner_microstep: 390160.21 | backward_inner: 390152.91 | backward_allreduce_microstep: 11.13 | backward_allreduce: 3.23 | reduce_tied_grads: 0.37 | comms: 18.41 | reduce_grads: 0.24 | step: 374.90 | _step_clipping: 0.13 | _step_step: 373.00 | _step_zero_grad: 0.58 | _step_check_overflow: 0.51 samples/sec: 16.068 | iteration 23860/ 143000 | elapsed time per iteration (ms): 63728.6 | learning rate: 5.644E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.336666E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 09:15:16,171] [INFO] [logging.py:60:log_dist] [Rank 0] step=23870, skipped=29, lr=[0.0005643671542985714, 0.0005643671542985714], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23870 loss: 2.3109 iter time (s): 63.797 samples/sec: 16.051 %comms: 0.0028279926012466524 %optimizer_step 0.055757379741204774 %forward: 22.81041219107881 %backward: 61.1549194262088 [2025-04-11 09:15:16,172] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28688.62 | forward: 145524.43 | backward_microstep: 390164.95 | backward: 390152.30 | backward_inner_microstep: 390133.81 | backward_inner: 390126.88 | backward_allreduce_microstep: 8.82 | backward_allreduce: 2.97 | reduce_tied_grads: 0.32 | comms: 18.04 | reduce_grads: 0.22 | step: 355.72 | _step_clipping: 0.13 | _step_step: 353.90 | _step_zero_grad: 0.52 | _step_check_overflow: 0.56 samples/sec: 16.051 | iteration 23870/ 143000 | elapsed time per iteration (ms): 63797.9 | learning rate: 5.644E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.334734E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 09:26:03,666] [INFO] [logging.py:60:log_dist] [Rank 0] step=23880, skipped=29, lr=[0.0005643359934787096, 0.0005643359934787096], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23880 loss: 2.3416 iter time (s): 64.749 samples/sec: 15.815 %comms: 0.0028503229926893797 %optimizer_step 0.056276388621149455 %forward: 22.54086852007505 %backward: 60.27857626319383 [2025-04-11 09:26:03,667] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 37625.05 | forward: 145949.47 | backward_microstep: 390311.74 | backward: 390296.68 | backward_inner_microstep: 390277.29 | backward_inner: 390270.06 | backward_allreduce_microstep: 9.16 | backward_allreduce: 3.13 | reduce_tied_grads: 0.36 | comms: 18.46 | reduce_grads: 0.22 | step: 364.38 | _step_clipping: 0.13 | _step_step: 362.46 | _step_zero_grad: 0.54 | _step_check_overflow: 0.59 samples/sec: 15.815 | iteration 23880/ 143000 | elapsed time per iteration (ms): 64749.5 | learning rate: 5.643E-04 | approx flops per GPU: 68.2TFLOPS | lm_loss: 2.330421E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 09:36:42,410] [INFO] [logging.py:60:log_dist] [Rank 0] step=23890, skipped=29, lr=[0.0005643048199008075, 0.0005643048199008075], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23890 loss: 2.3121 iter time (s): 63.874 samples/sec: 16.032 %comms: 0.0028437929691305904 %optimizer_step 0.05599790311150361 %forward: 22.83794931369015 %backward: 61.100290303906036 [2025-04-11 09:36:42,411] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28989.82 | forward: 145874.74 | backward_microstep: 390285.56 | backward: 390270.99 | backward_inner_microstep: 390251.65 | backward_inner: 390244.58 | backward_allreduce_microstep: 9.06 | backward_allreduce: 3.13 | reduce_tied_grads: 0.33 | comms: 18.16 | reduce_grads: 0.22 | step: 357.68 | _step_clipping: 0.16 | _step_step: 355.82 | _step_zero_grad: 0.52 | _step_check_overflow: 0.57 samples/sec: 16.031 | iteration 23890/ 143000 | elapsed time per iteration (ms): 63874.4 | learning rate: 5.643E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.328128E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 09:47:16,118] [INFO] [logging.py:60:log_dist] [Rank 0] step=23900, skipped=29, lr=[0.0005642736335663695, 0.0005642736335663695], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23900 loss: 2.3318 iter time (s): 63.370 samples/sec: 16.159 %comms: 0.0032338576256643716 %optimizer_step 0.05896095907679739 %forward: 23.00915607940385 %backward: 61.647053087906954 [2025-04-11 09:47:16,119] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23580.35 | forward: 145809.55 | backward_microstep: 390679.04 | backward: 390658.80 | backward_inner_microstep: 390639.99 | backward_inner: 390632.67 | backward_allreduce_microstep: 8.64 | backward_allreduce: 2.97 | reduce_tied_grads: 0.33 | comms: 20.49 | reduce_grads: 0.23 | step: 373.64 | _step_clipping: 0.13 | _step_step: 371.45 | _step_zero_grad: 0.62 | _step_check_overflow: 0.71 samples/sec: 16.159 | iteration 23900/ 143000 | elapsed time per iteration (ms): 63370.9 | learning rate: 5.643E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.337488E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 09:58:04,003] [INFO] [logging.py:60:log_dist] [Rank 0] step=23910, skipped=29, lr=[0.000564242434476901, 0.000564242434476901], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23910 loss: 2.3550 iter time (s): 64.788 samples/sec: 15.805 %comms: 0.002834667378742786 %optimizer_step 0.059909814977673295 %forward: 22.567213808598233 %backward: 60.30301141046669 [2025-04-11 09:58:04,003] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 37292.24 | forward: 146207.68 | backward_microstep: 390709.24 | backward: 390689.06 | backward_inner_microstep: 390668.68 | backward_inner: 390660.91 | backward_allreduce_microstep: 9.67 | backward_allreduce: 3.49 | reduce_tied_grads: 0.38 | comms: 18.37 | reduce_grads: 0.26 | step: 388.14 | _step_clipping: 0.15 | _step_step: 386.03 | _step_zero_grad: 0.65 | _step_check_overflow: 0.55 samples/sec: 15.805 | iteration 23910/ 143000 | elapsed time per iteration (ms): 64788.4 | learning rate: 5.642E-04 | approx flops per GPU: 68.2TFLOPS | lm_loss: 2.330892E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 10:08:55,195] [INFO] [logging.py:60:log_dist] [Rank 0] step=23920, skipped=29, lr=[0.0005642112226339077, 0.0005642112226339077], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23920 loss: 2.3323 iter time (s): 65.119 samples/sec: 15.725 %comms: 0.0028066028724398535 %optimizer_step 0.057183032394703265 %forward: 22.38196815620092 %backward: 59.922025852993656 [2025-04-11 10:08:55,196] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 41607.81 | forward: 145748.32 | backward_microstep: 390216.53 | backward: 390204.05 | backward_inner_microstep: 390183.24 | backward_inner: 390175.84 | backward_allreduce_microstep: 10.07 | backward_allreduce: 3.47 | reduce_tied_grads: 0.38 | comms: 18.28 | reduce_grads: 0.24 | step: 372.37 | _step_clipping: 0.15 | _step_step: 370.21 | _step_zero_grad: 0.62 | _step_check_overflow: 0.71 samples/sec: 15.725 | iteration 23920/ 143000 | elapsed time per iteration (ms): 65119.3 | learning rate: 5.642E-04 | approx flops per GPU: 67.8TFLOPS | lm_loss: 2.336345E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 10:19:51,885] [INFO] [logging.py:60:log_dist] [Rank 0] step=23930, skipped=29, lr=[0.000564179998038896, 0.000564179998038896], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23930 loss: 2.3399 iter time (s): 65.668 samples/sec: 15.594 %comms: 0.0027802792654699543 %optimizer_step 0.05568702079207621 %forward: 22.283037221683028 %backward: 59.438906728753985 [2025-04-11 10:19:51,885] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 46425.84 | forward: 146328.89 | backward_microstep: 390338.82 | backward: 390325.12 | backward_inner_microstep: 390305.68 | backward_inner: 390298.37 | backward_allreduce_microstep: 9.31 | backward_allreduce: 3.19 | reduce_tied_grads: 0.33 | comms: 18.26 | reduce_grads: 0.23 | step: 365.69 | _step_clipping: 0.13 | _step_step: 363.85 | _step_zero_grad: 0.58 | _step_check_overflow: 0.48 samples/sec: 15.593 | iteration 23930/ 143000 | elapsed time per iteration (ms): 65668.9 | learning rate: 5.642E-04 | approx flops per GPU: 67.3TFLOPS | lm_loss: 2.335690E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 10:30:28,068] [INFO] [logging.py:60:log_dist] [Rank 0] step=23940, skipped=29, lr=[0.0005641487606933731, 0.0005641487606933731], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23940 loss: 2.3387 iter time (s): 63.618 samples/sec: 16.096 %comms: 0.0028739801979470155 %optimizer_step 0.057705101616314404 %forward: 22.872990976413863 %backward: 61.344119737632575 [2025-04-11 10:30:28,069] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26818.07 | forward: 145512.74 | backward_microstep: 390270.14 | backward: 390257.28 | backward_inner_microstep: 390238.59 | backward_inner: 390231.33 | backward_allreduce_microstep: 9.06 | backward_allreduce: 3.23 | reduce_tied_grads: 0.39 | comms: 18.28 | reduce_grads: 0.26 | step: 367.11 | _step_clipping: 0.14 | _step_step: 365.03 | _step_zero_grad: 0.58 | _step_check_overflow: 0.68 samples/sec: 16.096 | iteration 23940/ 143000 | elapsed time per iteration (ms): 63618.4 | learning rate: 5.641E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.333663E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 10:41:15,224] [INFO] [logging.py:60:log_dist] [Rank 0] step=23950, skipped=29, lr=[0.0005641175105988465, 0.0005641175105988465], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23950 loss: 2.3412 iter time (s): 64.715 samples/sec: 15.823 %comms: 0.0028204619624690114 %optimizer_step 0.05714131338572598 %forward: 22.52255704583907 %backward: 60.31220896958408 [2025-04-11 10:41:15,225] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 37469.67 | forward: 145754.66 | backward_microstep: 390324.73 | backward: 390310.28 | backward_inner_microstep: 390291.43 | backward_inner: 390284.09 | backward_allreduce_microstep: 8.97 | backward_allreduce: 3.08 | reduce_tied_grads: 0.33 | comms: 18.25 | reduce_grads: 0.23 | step: 369.79 | _step_clipping: 0.12 | _step_step: 367.78 | _step_zero_grad: 0.57 | _step_check_overflow: 0.68 samples/sec: 15.823 | iteration 23950/ 143000 | elapsed time per iteration (ms): 64715.6 | learning rate: 5.641E-04 | approx flops per GPU: 68.3TFLOPS | lm_loss: 2.343274E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 10:52:01,693] [INFO] [logging.py:60:log_dist] [Rank 0] step=23960, skipped=29, lr=[0.0005640862477568244, 0.0005640862477568244], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23960 loss: 2.3123 iter time (s): 64.646 samples/sec: 15.840 %comms: 0.0028226505453448998 %optimizer_step 0.05808431003699136 %forward: 22.5611757697794 %backward: 60.37453349010534 [2025-04-11 10:52:01,693] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 36686.86 | forward: 145849.45 | backward_microstep: 390313.05 | backward: 390298.48 | backward_inner_microstep: 390278.79 | backward_inner: 390271.20 | backward_allreduce_microstep: 9.29 | backward_allreduce: 3.19 | reduce_tied_grads: 0.39 | comms: 18.25 | reduce_grads: 0.26 | step: 375.49 | _step_clipping: 0.15 | _step_step: 373.42 | _step_zero_grad: 0.63 | _step_check_overflow: 0.60 samples/sec: 15.840 | iteration 23960/ 143000 | elapsed time per iteration (ms): 64646.9 | learning rate: 5.641E-04 | approx flops per GPU: 68.3TFLOPS | lm_loss: 2.328119E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 11:02:57,974] [INFO] [logging.py:60:log_dist] [Rank 0] step=23970, skipped=29, lr=[0.0005640549721688157, 0.0005640549721688157], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23970 loss: 2.3255 iter time (s): 65.627 samples/sec: 15.603 %comms: 0.002793233586125314 %optimizer_step 0.0569611921918967 %forward: 22.346214105847192 %backward: 59.48428223171468 [2025-04-11 11:02:57,975] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 45629.02 | forward: 146652.58 | backward_microstep: 390397.07 | backward: 390380.38 | backward_inner_microstep: 390360.37 | backward_inner: 390352.75 | backward_allreduce_microstep: 9.37 | backward_allreduce: 3.21 | reduce_tied_grads: 0.34 | comms: 18.33 | reduce_grads: 0.24 | step: 373.82 | _step_clipping: 0.14 | _step_step: 371.59 | _step_zero_grad: 0.63 | _step_check_overflow: 0.79 samples/sec: 15.603 | iteration 23970/ 143000 | elapsed time per iteration (ms): 65628.1 | learning rate: 5.641E-04 | approx flops per GPU: 67.3TFLOPS | lm_loss: 2.328659E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 11:13:40,772] [INFO] [logging.py:60:log_dist] [Rank 0] step=23980, skipped=29, lr=[0.0005640236838363302, 0.0005640236838363302], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23980 loss: 2.3420 iter time (s): 64.279 samples/sec: 15.931 %comms: 0.0028426241018072933 %optimizer_step 0.05621829856698916 %forward: 22.706231266780673 %backward: 60.71924404466996 [2025-04-11 11:13:40,773] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32962.34 | forward: 145953.85 | backward_microstep: 390312.82 | backward: 390298.47 | backward_inner_microstep: 390279.04 | backward_inner: 390271.86 | backward_allreduce_microstep: 9.33 | backward_allreduce: 3.21 | reduce_tied_grads: 0.34 | comms: 18.27 | reduce_grads: 0.23 | step: 361.37 | _step_clipping: 0.13 | _step_step: 359.58 | _step_zero_grad: 0.51 | _step_check_overflow: 0.55 samples/sec: 15.930 | iteration 23980/ 143000 | elapsed time per iteration (ms): 64279.8 | learning rate: 5.640E-04 | approx flops per GPU: 68.7TFLOPS | lm_loss: 2.328019E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 11:24:28,509] [INFO] [logging.py:60:log_dist] [Rank 0] step=23990, skipped=29, lr=[0.0005639923827608777, 0.0005639923827608777], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 23990 loss: 2.3467 iter time (s): 64.773 samples/sec: 15.809 %comms: 0.0028316265076884528 %optimizer_step 0.05818193917006509 %forward: 22.554379482434136 %backward: 60.25668739750459 [2025-04-11 11:24:28,509] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 37720.68 | forward: 146091.55 | backward_microstep: 390315.80 | backward: 390300.82 | backward_inner_microstep: 390281.80 | backward_inner: 390274.68 | backward_allreduce_microstep: 9.06 | backward_allreduce: 3.11 | reduce_tied_grads: 0.35 | comms: 18.34 | reduce_grads: 0.23 | step: 376.86 | _step_clipping: 0.13 | _step_step: 374.91 | _step_zero_grad: 0.58 | _step_check_overflow: 0.60 samples/sec: 15.809 | iteration 23990/ 143000 | elapsed time per iteration (ms): 64773.6 | learning rate: 5.640E-04 | approx flops per GPU: 68.2TFLOPS | lm_loss: 2.326858E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 11:35:06,422] [INFO] [logging.py:60:log_dist] [Rank 0] step=24000, skipped=29, lr=[0.000563961068943969, 0.000563961068943969], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24000 loss: 2.3340 iter time (s): 63.791 samples/sec: 16.052 %comms: 0.0028933565132200504 %optimizer_step 0.06022579850058943 %forward: 22.854160655729476 %backward: 61.185217602747024 [2025-04-11 11:35:06,423] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28184.24 | forward: 145788.39 | backward_microstep: 390319.72 | backward: 390305.04 | backward_inner_microstep: 390285.71 | backward_inner: 390278.41 | backward_allreduce_microstep: 9.29 | backward_allreduce: 3.17 | reduce_tied_grads: 0.39 | comms: 18.46 | reduce_grads: 0.24 | step: 384.18 | _step_clipping: 0.14 | _step_step: 382.17 | _step_zero_grad: 0.57 | _step_check_overflow: 0.62 samples/sec: 16.052 | iteration 24000/ 143000 | elapsed time per iteration (ms): 63791.4 | learning rate: 5.640E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.344621E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 11:35:09,268] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: ../checkpoints/mydata_left-pythia160m/global_step24000/mp_rank_00_model_states.pt [2025-04-11 11:35:23,705] [INFO] [engine.py:1805:_copy_recovery_script] creating recovery script ../checkpoints/mydata_left-pythia160m/zero_to_fp32.py [2025-04-11 11:35:23,710] [INFO] [engine.py:1818:_save_zero_checkpoint] zero checkpoint saved ../checkpoints/mydata_left-pythia160m/global_step24000/zero_pp_rank_0_mp_rank_00_optim_states.pt [2025-04-11 11:46:01,389] [INFO] [logging.py:60:log_dist] [Rank 0] step=24010, skipped=29, lr=[0.0005639297423871156, 0.0005639297423871156], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24010 loss: 2.3159 iter time (s): 63.766 samples/sec: 16.059 %comms: 0.002908107022361518 %optimizer_step 0.0577711973002908 %forward: 22.867240638955895 %backward: 61.20988850309676 [2025-04-11 11:46:01,390] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27913.67 | forward: 145816.22 | backward_microstep: 390327.79 | backward: 390313.57 | backward_inner_microstep: 390294.59 | backward_inner: 390287.33 | backward_allreduce_microstep: 9.10 | backward_allreduce: 3.15 | reduce_tied_grads: 0.36 | comms: 18.54 | reduce_grads: 0.23 | step: 368.39 | _step_clipping: 0.13 | _step_step: 366.08 | _step_zero_grad: 0.59 | _step_check_overflow: 0.84 samples/sec: 15.634 | iteration 24010/ 143000 | elapsed time per iteration (ms): 65496.7 | learning rate: 5.639E-04 | approx flops per GPU: 67.4TFLOPS | lm_loss: 2.335930E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 11:56:52,137] [INFO] [logging.py:60:log_dist] [Rank 0] step=24020, skipped=29, lr=[0.0005638984030918292, 0.0005638984030918292], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24020 loss: 2.3313 iter time (s): 65.074 samples/sec: 15.736 %comms: 0.0028235429493708403 %optimizer_step 0.05766228962051817 %forward: 22.467842068450025 %backward: 59.968460622970454 [2025-04-11 11:56:52,138] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 40697.50 | forward: 146207.58 | backward_microstep: 390253.42 | backward: 390239.67 | backward_inner_microstep: 390218.33 | backward_inner: 390210.92 | backward_allreduce_microstep: 9.42 | backward_allreduce: 3.23 | reduce_tied_grads: 0.39 | comms: 18.37 | reduce_grads: 0.27 | step: 375.23 | _step_clipping: 0.16 | _step_step: 372.90 | _step_zero_grad: 0.65 | _step_check_overflow: 0.81 samples/sec: 15.736 | iteration 24020/ 143000 | elapsed time per iteration (ms): 65074.8 | learning rate: 5.639E-04 | approx flops per GPU: 67.9TFLOPS | lm_loss: 2.324809E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 12:07:43,585] [INFO] [logging.py:60:log_dist] [Rank 0] step=24030, skipped=29, lr=[0.0005638670510596225, 0.0005638670510596225], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24030 loss: 2.3179 iter time (s): 65.144 samples/sec: 15.719 %comms: 0.0028003453801024583 %optimizer_step 0.056670916832019747 %forward: 22.379781433597554 %backward: 59.901160518792054 [2025-04-11 12:07:43,585] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 41843.33 | forward: 145791.07 | backward_microstep: 390234.57 | backward: 390220.71 | backward_inner_microstep: 390201.09 | backward_inner: 390193.64 | backward_allreduce_microstep: 9.43 | backward_allreduce: 3.23 | reduce_tied_grads: 0.35 | comms: 18.24 | reduce_grads: 0.27 | step: 369.18 | _step_clipping: 0.14 | _step_step: 367.25 | _step_zero_grad: 0.55 | _step_check_overflow: 0.60 samples/sec: 15.719 | iteration 24030/ 143000 | elapsed time per iteration (ms): 65144.7 | learning rate: 5.639E-04 | approx flops per GPU: 67.8TFLOPS | lm_loss: 2.326730E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 12:18:33,080] [INFO] [logging.py:60:log_dist] [Rank 0] step=24040, skipped=29, lr=[0.0005638356862920088, 0.0005638356862920088], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24040 loss: 2.3540 iter time (s): 64.949 samples/sec: 15.766 %comms: 0.002804358655471062 %optimizer_step 0.05607777569053518 %forward: 22.47616730232331 %backward: 60.09524428673172 [2025-04-11 12:18:33,080] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 39606.14 | forward: 145980.12 | backward_microstep: 390326.05 | backward: 390311.71 | backward_inner_microstep: 390290.62 | backward_inner: 390283.34 | backward_allreduce_microstep: 9.36 | backward_allreduce: 3.21 | reduce_tied_grads: 0.32 | comms: 18.21 | reduce_grads: 0.22 | step: 364.22 | _step_clipping: 0.12 | _step_step: 362.21 | _step_zero_grad: 0.58 | _step_check_overflow: 0.64 samples/sec: 15.766 | iteration 24040/ 143000 | elapsed time per iteration (ms): 64949.5 | learning rate: 5.638E-04 | approx flops per GPU: 68.0TFLOPS | lm_loss: 2.335992E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 12:29:08,152] [INFO] [logging.py:60:log_dist] [Rank 0] step=24050, skipped=29, lr=[0.0005638043087905017, 0.0005638043087905017], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24050 loss: 2.3321 iter time (s): 63.506 samples/sec: 16.124 %comms: 0.002830229149887745 %optimizer_step 0.0570374124228422 %forward: 22.939484614870096 %backward: 61.47875857347231 [2025-04-11 12:29:08,153] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25340.26 | forward: 145679.56 | backward_microstep: 390445.85 | backward: 390427.18 | backward_inner_microstep: 390409.35 | backward_inner: 390402.39 | backward_allreduce_microstep: 8.34 | backward_allreduce: 2.86 | reduce_tied_grads: 0.30 | comms: 17.97 | reduce_grads: 0.20 | step: 362.22 | _step_clipping: 0.14 | _step_step: 360.34 | _step_zero_grad: 0.51 | _step_check_overflow: 0.63 samples/sec: 16.124 | iteration 24050/ 143000 | elapsed time per iteration (ms): 63507.3 | learning rate: 5.638E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.339970E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 12:39:57,617] [INFO] [logging.py:60:log_dist] [Rank 0] step=24060, skipped=29, lr=[0.0005637729185566159, 0.0005637729185566159], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24060 loss: 2.3333 iter time (s): 64.946 samples/sec: 15.767 %comms: 0.0027715926828406935 %optimizer_step 0.05579227560889442 %forward: 22.461207180127893 %backward: 60.07394141147779 [2025-04-11 12:39:57,618] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 39833.95 | forward: 145876.38 | backward_microstep: 390167.86 | backward: 390155.75 | backward_inner_microstep: 390136.54 | backward_inner: 390129.50 | backward_allreduce_microstep: 9.42 | backward_allreduce: 3.22 | reduce_tied_grads: 0.32 | comms: 18.00 | reduce_grads: 0.22 | step: 362.35 | _step_clipping: 0.14 | _step_step: 360.42 | _step_zero_grad: 0.52 | _step_check_overflow: 0.66 samples/sec: 15.767 | iteration 24060/ 143000 | elapsed time per iteration (ms): 64946.5 | learning rate: 5.638E-04 | approx flops per GPU: 68.0TFLOPS | lm_loss: 2.335651E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 12:50:46,537] [INFO] [logging.py:60:log_dist] [Rank 0] step=24070, skipped=29, lr=[0.0005637415155918661, 0.0005637415155918661], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24070 loss: 2.3316 iter time (s): 64.891 samples/sec: 15.780 %comms: 0.0028145614582209665 %optimizer_step 0.05730826298368719 %forward: 22.482735482587156 %backward: 60.14946498899379 [2025-04-11 12:50:46,537] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 39109.19 | forward: 145893.39 | backward_microstep: 390331.23 | backward: 390317.70 | backward_inner_microstep: 390299.03 | backward_inner: 390292.02 | backward_allreduce_microstep: 8.95 | backward_allreduce: 3.06 | reduce_tied_grads: 0.34 | comms: 18.26 | reduce_grads: 0.24 | step: 371.88 | _step_clipping: 0.13 | _step_step: 370.00 | _step_zero_grad: 0.56 | _step_check_overflow: 0.51 samples/sec: 15.780 | iteration 24070/ 143000 | elapsed time per iteration (ms): 64891.9 | learning rate: 5.637E-04 | approx flops per GPU: 68.1TFLOPS | lm_loss: 2.335804E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 13:01:21,204] [INFO] [logging.py:60:log_dist] [Rank 0] step=24080, skipped=29, lr=[0.000563710099897768, 0.000563710099897768], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24080 loss: 2.3453 iter time (s): 63.466 samples/sec: 16.135 %comms: 0.002889635324565943 %optimizer_step 0.05869846691188813 %forward: 22.92492449586619 %backward: 61.464576628071256 [2025-04-11 13:01:21,205] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25506.68 | forward: 145495.61 | backward_microstep: 390102.29 | backward: 390091.85 | backward_inner_microstep: 390072.81 | backward_inner: 390065.91 | backward_allreduce_microstep: 9.37 | backward_allreduce: 3.18 | reduce_tied_grads: 0.34 | comms: 18.34 | reduce_grads: 0.23 | step: 372.54 | _step_clipping: 0.13 | _step_step: 370.57 | _step_zero_grad: 0.60 | _step_check_overflow: 0.58 samples/sec: 16.134 | iteration 24080/ 143000 | elapsed time per iteration (ms): 63466.8 | learning rate: 5.637E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.342722E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 13:12:09,423] [INFO] [logging.py:60:log_dist] [Rank 0] step=24090, skipped=29, lr=[0.0005636786714758382, 0.0005636786714758382], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24090 loss: 2.3413 iter time (s): 64.821 samples/sec: 15.797 %comms: 0.0027957903568472334 %optimizer_step 0.057163967240081166 %forward: 22.51936100270098 %backward: 60.19898106204467 [2025-04-11 13:12:09,424] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 38425.73 | forward: 145973.40 | backward_microstep: 390230.78 | backward: 390217.55 | backward_inner_microstep: 390198.19 | backward_inner: 390190.99 | backward_allreduce_microstep: 9.39 | backward_allreduce: 3.21 | reduce_tied_grads: 0.31 | comms: 18.12 | reduce_grads: 0.24 | step: 370.54 | _step_clipping: 0.14 | _step_step: 368.61 | _step_zero_grad: 0.60 | _step_check_overflow: 0.55 samples/sec: 15.797 | iteration 24090/ 143000 | elapsed time per iteration (ms): 64821.9 | learning rate: 5.637E-04 | approx flops per GPU: 68.1TFLOPS | lm_loss: 2.341755E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 13:22:58,886] [INFO] [logging.py:60:log_dist] [Rank 0] step=24100, skipped=29, lr=[0.0005636472303275933, 0.0005636472303275933], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24100 loss: 2.3495 iter time (s): 64.946 samples/sec: 15.767 %comms: 0.0028096371351822745 %optimizer_step 0.055838596633963594 %forward: 22.43465612395709 %backward: 60.09230987092261 [2025-04-11 13:22:58,887] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 39923.57 | forward: 145703.29 | backward_microstep: 390289.51 | backward: 390273.30 | backward_inner_microstep: 390251.71 | backward_inner: 390244.45 | backward_allreduce_microstep: 11.31 | backward_allreduce: 3.30 | reduce_tied_grads: 0.35 | comms: 18.25 | reduce_grads: 0.23 | step: 362.65 | _step_clipping: 0.16 | _step_step: 360.66 | _step_zero_grad: 0.61 | _step_check_overflow: 0.55 samples/sec: 15.767 | iteration 24100/ 143000 | elapsed time per iteration (ms): 64946.3 | learning rate: 5.636E-04 | approx flops per GPU: 68.0TFLOPS | lm_loss: 2.337050E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 13:33:37,864] [INFO] [logging.py:60:log_dist] [Rank 0] step=24110, skipped=29, lr=[0.0005636157764545507, 0.0005636157764545507], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24110 loss: 2.3415 iter time (s): 63.897 samples/sec: 16.026 %comms: 0.002865291217505857 %optimizer_step 0.057485896173842366 %forward: 22.814357913359036 %backward: 61.119765538993086 [2025-04-11 13:33:37,864] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28993.41 | forward: 145777.30 | backward_microstep: 390557.16 | backward: 390538.03 | backward_inner_microstep: 390517.71 | backward_inner: 390510.08 | backward_allreduce_microstep: 9.79 | backward_allreduce: 3.35 | reduce_tied_grads: 0.38 | comms: 18.31 | reduce_grads: 0.23 | step: 367.32 | _step_clipping: 0.14 | _step_step: 365.27 | _step_zero_grad: 0.58 | _step_check_overflow: 0.65 samples/sec: 16.026 | iteration 24110/ 143000 | elapsed time per iteration (ms): 63897.8 | learning rate: 5.636E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.338885E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 13:44:11,184] [INFO] [logging.py:60:log_dist] [Rank 0] step=24120, skipped=29, lr=[0.0005635843098582288, 0.0005635843098582288], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24120 loss: 2.3250 iter time (s): 63.331 samples/sec: 16.169 %comms: 0.0029400127177685973 %optimizer_step 0.05783387801772187 %forward: 23.01793009461709 %backward: 61.64486159744064 [2025-04-11 13:44:11,185] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23505.42 | forward: 145775.97 | backward_microstep: 390422.43 | backward: 390406.07 | backward_inner_microstep: 390383.68 | backward_inner: 390376.28 | backward_allreduce_microstep: 9.09 | backward_allreduce: 3.15 | reduce_tied_grads: 0.33 | comms: 18.62 | reduce_grads: 0.24 | step: 366.27 | _step_clipping: 0.12 | _step_step: 364.42 | _step_zero_grad: 0.51 | _step_check_overflow: 0.59 samples/sec: 16.169 | iteration 24120/ 143000 | elapsed time per iteration (ms): 63332.1 | learning rate: 5.636E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.327775E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 13:54:53,046] [INFO] [logging.py:60:log_dist] [Rank 0] step=24130, skipped=29, lr=[0.000563552830540146, 0.000563552830540146], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24130 loss: 2.3336 iter time (s): 64.186 samples/sec: 15.954 %comms: 0.002886143666786992 %optimizer_step 0.058020183390038636 %forward: 22.678348595805396 %backward: 60.79869583971478 [2025-04-11 13:54:53,047] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32433.82 | forward: 145562.33 | backward_microstep: 390255.55 | backward: 390240.05 | backward_inner_microstep: 390221.51 | backward_inner: 390214.43 | backward_allreduce_microstep: 8.76 | backward_allreduce: 3.00 | reduce_tied_grads: 0.33 | comms: 18.52 | reduce_grads: 0.22 | step: 372.41 | _step_clipping: 0.14 | _step_step: 370.34 | _step_zero_grad: 0.58 | _step_check_overflow: 0.66 samples/sec: 15.954 | iteration 24130/ 143000 | elapsed time per iteration (ms): 64186.2 | learning rate: 5.636E-04 | approx flops per GPU: 68.8TFLOPS | lm_loss: 2.328609E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 14:05:37,665] [INFO] [logging.py:60:log_dist] [Rank 0] step=24140, skipped=29, lr=[0.0005635213385018219, 0.0005635213385018219], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24140 loss: 2.3317 iter time (s): 64.461 samples/sec: 15.886 %comms: 0.002913043056488102 %optimizer_step 0.05921913985691479 %forward: 22.690014961723495 %backward: 60.58953657924809 [2025-04-11 14:05:37,666] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 34118.37 | forward: 146262.73 | backward_microstep: 390584.99 | backward: 390567.88 | backward_inner_microstep: 390548.21 | backward_inner: 390540.64 | backward_allreduce_microstep: 9.32 | backward_allreduce: 3.34 | reduce_tied_grads: 0.38 | comms: 18.78 | reduce_grads: 0.22 | step: 381.73 | _step_clipping: 0.14 | _step_step: 379.46 | _step_zero_grad: 0.60 | _step_check_overflow: 0.82 samples/sec: 15.885 | iteration 24140/ 143000 | elapsed time per iteration (ms): 64461.9 | learning rate: 5.635E-04 | approx flops per GPU: 68.5TFLOPS | lm_loss: 2.332177E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 14:16:16,152] [INFO] [logging.py:60:log_dist] [Rank 0] step=24150, skipped=29, lr=[0.0005634898337447764, 0.0005634898337447764], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24150 loss: 2.3279 iter time (s): 63.848 samples/sec: 16.038 %comms: 0.00284438189914404 %optimizer_step 0.05667393219061066 %forward: 22.789566382097533 %backward: 61.09328470306112 [2025-04-11 14:16:16,153] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29342.33 | forward: 145506.91 | backward_microstep: 390081.06 | backward: 390068.63 | backward_inner_microstep: 390049.07 | backward_inner: 390041.93 | backward_allreduce_microstep: 9.61 | backward_allreduce: 3.43 | reduce_tied_grads: 0.31 | comms: 18.16 | reduce_grads: 0.21 | step: 361.85 | _step_clipping: 0.14 | _step_step: 359.94 | _step_zero_grad: 0.55 | _step_check_overflow: 0.58 samples/sec: 16.038 | iteration 24150/ 143000 | elapsed time per iteration (ms): 63848.7 | learning rate: 5.635E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.329410E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 14:27:03,040] [INFO] [logging.py:60:log_dist] [Rank 0] step=24160, skipped=29, lr=[0.0005634583162705298, 0.0005634583162705298], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24160 loss: 2.3482 iter time (s): 64.688 samples/sec: 15.830 %comms: 0.00281131041058707 %optimizer_step 0.05659473729384295 %forward: 22.54823812013904 %backward: 60.33388816660039 [2025-04-11 14:27:03,041] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 37116.66 | forward: 145860.44 | backward_microstep: 390304.18 | backward: 390288.91 | backward_inner_microstep: 390268.83 | backward_inner: 390261.52 | backward_allreduce_microstep: 9.58 | backward_allreduce: 3.32 | reduce_tied_grads: 0.36 | comms: 18.19 | reduce_grads: 0.25 | step: 366.10 | _step_clipping: 0.14 | _step_step: 364.16 | _step_zero_grad: 0.56 | _step_check_overflow: 0.59 samples/sec: 15.830 | iteration 24160/ 143000 | elapsed time per iteration (ms): 64688.8 | learning rate: 5.635E-04 | approx flops per GPU: 68.3TFLOPS | lm_loss: 2.333657E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 14:37:55,766] [INFO] [logging.py:60:log_dist] [Rank 0] step=24170, skipped=29, lr=[0.0005634267860806036, 0.0005634267860806036], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24170 loss: 2.3273 iter time (s): 65.272 samples/sec: 15.688 %comms: 0.002934252168807338 %optimizer_step 0.06412512227790303 %forward: 22.462099848254955 %backward: 59.859558650142354 [2025-04-11 14:37:55,767] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 41679.45 | forward: 146614.25 | backward_microstep: 390735.64 | backward: 390714.33 | backward_inner_microstep: 390689.75 | backward_inner: 390681.66 | backward_allreduce_microstep: 11.65 | backward_allreduce: 3.40 | reduce_tied_grads: 0.46 | comms: 19.15 | reduce_grads: 0.32 | step: 418.56 | _step_clipping: 0.18 | _step_step: 415.78 | _step_zero_grad: 0.77 | _step_check_overflow: 0.91 samples/sec: 15.688 | iteration 24170/ 143000 | elapsed time per iteration (ms): 65272.6 | learning rate: 5.634E-04 | approx flops per GPU: 67.7TFLOPS | lm_loss: 2.336261E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 14:48:41,355] [INFO] [logging.py:60:log_dist] [Rank 0] step=24180, skipped=29, lr=[0.0005633952431765195, 0.0005633952431765195], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24180 loss: 2.3291 iter time (s): 64.558 samples/sec: 15.862 %comms: 0.00287886717096439 %optimizer_step 0.05835281649737216 %forward: 22.603892638273237 %backward: 60.429126496854394 [2025-04-11 14:48:41,356] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 35940.80 | forward: 145926.62 | backward_microstep: 390134.56 | backward: 390119.46 | backward_inner_microstep: 390099.52 | backward_inner: 390092.12 | backward_allreduce_microstep: 9.57 | backward_allreduce: 3.31 | reduce_tied_grads: 0.40 | comms: 18.59 | reduce_grads: 0.26 | step: 376.72 | _step_clipping: 0.16 | _step_step: 374.41 | _step_zero_grad: 0.65 | _step_check_overflow: 0.77 samples/sec: 15.861 | iteration 24180/ 143000 | elapsed time per iteration (ms): 64558.9 | learning rate: 5.634E-04 | approx flops per GPU: 68.4TFLOPS | lm_loss: 2.333701E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 14:59:31,862] [INFO] [logging.py:60:log_dist] [Rank 0] step=24190, skipped=29, lr=[0.0005633636875597998, 0.0005633636875597998], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24190 loss: 2.3273 iter time (s): 65.050 samples/sec: 15.742 %comms: 0.0028210021293614443 %optimizer_step 0.05791246002961362 %forward: 22.452937059072518 %backward: 59.96819781909597 [2025-04-11 14:59:31,862] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 40782.23 | forward: 146056.22 | backward_microstep: 390105.22 | backward: 390092.75 | backward_inner_microstep: 390074.21 | backward_inner: 390067.31 | backward_allreduce_microstep: 8.89 | backward_allreduce: 3.03 | reduce_tied_grads: 0.33 | comms: 18.35 | reduce_grads: 0.23 | step: 376.72 | _step_clipping: 0.14 | _step_step: 374.60 | _step_zero_grad: 0.63 | _step_check_overflow: 0.67 samples/sec: 15.742 | iteration 24190/ 143000 | elapsed time per iteration (ms): 65050.6 | learning rate: 5.634E-04 | approx flops per GPU: 67.9TFLOPS | lm_loss: 2.342267E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 15:10:28,272] [INFO] [logging.py:60:log_dist] [Rank 0] step=24200, skipped=29, lr=[0.0005633321192319676, 0.0005633321192319676], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24200 loss: 2.3517 iter time (s): 65.640 samples/sec: 15.600 %comms: 0.0027663865088574484 %optimizer_step 0.05512341936858939 %forward: 22.276723798187053 %backward: 59.44698850931459 [2025-04-11 15:10:28,273] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 46389.23 | forward: 146225.33 | backward_microstep: 390229.57 | backward: 390212.47 | backward_inner_microstep: 390192.02 | backward_inner: 390184.59 | backward_allreduce_microstep: 9.80 | backward_allreduce: 3.37 | reduce_tied_grads: 0.34 | comms: 18.16 | reduce_grads: 0.23 | step: 361.83 | _step_clipping: 0.13 | _step_step: 359.95 | _step_zero_grad: 0.59 | _step_check_overflow: 0.52 samples/sec: 15.600 | iteration 24200/ 143000 | elapsed time per iteration (ms): 65641.0 | learning rate: 5.633E-04 | approx flops per GPU: 67.3TFLOPS | lm_loss: 2.332498E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 15:21:11,278] [INFO] [logging.py:60:log_dist] [Rank 0] step=24210, skipped=29, lr=[0.0005633005381945464, 0.0005633005381945464], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24210 loss: 2.3222 iter time (s): 64.300 samples/sec: 15.925 %comms: 0.0028351084716706034 %optimizer_step 0.05585268623170174 %forward: 23.221721973804975 %backward: 60.69449854003241 [2025-04-11 15:21:11,278] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29822.79 | forward: 149315.47 | backward_microstep: 390280.08 | backward: 390265.10 | backward_inner_microstep: 390246.81 | backward_inner: 390239.91 | backward_allreduce_microstep: 8.78 | backward_allreduce: 3.01 | reduce_tied_grads: 0.33 | comms: 18.23 | reduce_grads: 0.23 | step: 359.13 | _step_clipping: 0.14 | _step_step: 357.23 | _step_zero_grad: 0.55 | _step_check_overflow: 0.57 samples/sec: 15.925 | iteration 24210/ 143000 | elapsed time per iteration (ms): 64300.5 | learning rate: 5.633E-04 | approx flops per GPU: 68.7TFLOPS | lm_loss: 2.334779E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 15:31:54,804] [INFO] [logging.py:60:log_dist] [Rank 0] step=24220, skipped=29, lr=[0.0005632689444490607, 0.0005632689444490607], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24220 loss: 2.3222 iter time (s): 64.352 samples/sec: 15.912 %comms: 0.0028070628035713695 %optimizer_step 0.059775143843814625 %forward: 22.622838590632178 %backward: 60.61168201462445 [2025-04-11 15:31:54,805] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 34308.34 | forward: 145582.58 | backward_microstep: 390059.67 | backward: 390048.54 | backward_inner_microstep: 390028.89 | backward_inner: 390021.80 | backward_allreduce_microstep: 9.64 | backward_allreduce: 3.31 | reduce_tied_grads: 0.34 | comms: 18.06 | reduce_grads: 0.23 | step: 384.67 | _step_clipping: 0.14 | _step_step: 382.71 | _step_zero_grad: 0.58 | _step_check_overflow: 0.60 samples/sec: 15.912 | iteration 24220/ 143000 | elapsed time per iteration (ms): 64352.7 | learning rate: 5.633E-04 | approx flops per GPU: 68.6TFLOPS | lm_loss: 2.333212E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 15:42:41,561] [INFO] [logging.py:60:log_dist] [Rank 0] step=24230, skipped=29, lr=[0.0005632373379970352, 0.0005632373379970352], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24230 loss: 2.3213 iter time (s): 64.675 samples/sec: 15.833 %comms: 0.0028118381593677036 %optimizer_step 0.05461294102727766 %forward: 22.494694400062805 %backward: 60.310085188805616 [2025-04-11 15:42:41,562] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 37651.10 | forward: 145484.85 | backward_microstep: 390067.09 | backward: 390056.60 | backward_inner_microstep: 390038.85 | backward_inner: 390032.00 | backward_allreduce_microstep: 8.59 | backward_allreduce: 2.95 | reduce_tied_grads: 0.32 | comms: 18.19 | reduce_grads: 0.21 | step: 353.21 | _step_clipping: 0.13 | _step_step: 351.42 | _step_zero_grad: 0.49 | _step_check_overflow: 0.55 samples/sec: 15.833 | iteration 24230/ 143000 | elapsed time per iteration (ms): 64675.8 | learning rate: 5.632E-04 | approx flops per GPU: 68.3TFLOPS | lm_loss: 2.329765E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 15:53:24,263] [INFO] [logging.py:60:log_dist] [Rank 0] step=24240, skipped=29, lr=[0.0005632057188399953, 0.0005632057188399953], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24240 loss: 2.3193 iter time (s): 64.269 samples/sec: 15.933 %comms: 0.0028168630525619623 %optimizer_step 0.05587388628151395 %forward: 22.637420533067914 %backward: 60.7018852748062 [2025-04-11 15:53:24,263] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 33492.01 | forward: 145489.57 | backward_microstep: 390140.15 | backward: 390127.98 | backward_inner_microstep: 390105.62 | backward_inner: 390096.57 | backward_allreduce_microstep: 12.56 | backward_allreduce: 3.15 | reduce_tied_grads: 0.30 | comms: 18.10 | reduce_grads: 0.21 | step: 359.10 | _step_clipping: 0.11 | _step_step: 357.40 | _step_zero_grad: 0.51 | _step_check_overflow: 0.47 samples/sec: 15.933 | iteration 24240/ 143000 | elapsed time per iteration (ms): 64270.1 | learning rate: 5.632E-04 | approx flops per GPU: 68.7TFLOPS | lm_loss: 2.325855E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 16:04:07,218] [INFO] [logging.py:60:log_dist] [Rank 0] step=24250, skipped=29, lr=[0.0005631740869794672, 0.0005631740869794672], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24250 loss: 2.3448 iter time (s): 64.295 samples/sec: 15.927 %comms: 0.002840593034215462 %optimizer_step 0.05655039544816635 %forward: 22.688304785991992 %backward: 60.69932370160716 [2025-04-11 16:04:07,218] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 33192.91 | forward: 145874.34 | backward_microstep: 390279.94 | backward: 390265.99 | backward_inner_microstep: 390245.09 | backward_inner: 390237.91 | backward_allreduce_microstep: 10.79 | backward_allreduce: 4.84 | reduce_tied_grads: 0.33 | comms: 18.26 | reduce_grads: 0.22 | step: 363.59 | _step_clipping: 0.15 | _step_step: 361.61 | _step_zero_grad: 0.59 | _step_check_overflow: 0.60 samples/sec: 15.926 | iteration 24250/ 143000 | elapsed time per iteration (ms): 64295.5 | learning rate: 5.632E-04 | approx flops per GPU: 68.7TFLOPS | lm_loss: 2.327565E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 16:14:54,101] [INFO] [logging.py:60:log_dist] [Rank 0] step=24260, skipped=29, lr=[0.0005631424424169775, 0.0005631424424169775], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24260 loss: 2.3472 iter time (s): 64.688 samples/sec: 15.830 %comms: 0.0028927879103595707 %optimizer_step 0.05748744715455564 %forward: 22.999325322926556 %backward: 60.37632240958391 [2025-04-11 16:14:54,102] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 33870.38 | forward: 148777.18 | backward_microstep: 390577.15 | backward: 390560.11 | backward_inner_microstep: 390540.37 | backward_inner: 390532.79 | backward_allreduce_microstep: 9.31 | backward_allreduce: 3.19 | reduce_tied_grads: 0.41 | comms: 18.71 | reduce_grads: 0.26 | step: 371.87 | _step_clipping: 0.16 | _step_step: 369.85 | _step_zero_grad: 0.55 | _step_check_overflow: 0.60 samples/sec: 15.830 | iteration 24260/ 143000 | elapsed time per iteration (ms): 64688.3 | learning rate: 5.631E-04 | approx flops per GPU: 68.3TFLOPS | lm_loss: 2.335137E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 16:25:31,358] [INFO] [logging.py:60:log_dist] [Rank 0] step=24270, skipped=29, lr=[0.0005631107851540536, 0.0005631107851540536], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24270 loss: 2.3378 iter time (s): 63.725 samples/sec: 16.069 %comms: 0.0029002671839014216 %optimizer_step 0.059458788376234056 %forward: 22.87581122589623 %backward: 61.22597217738698 [2025-04-11 16:25:31,359] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27716.33 | forward: 145776.24 | backward_microstep: 390175.69 | backward: 390162.87 | backward_inner_microstep: 390144.18 | backward_inner: 390137.20 | backward_allreduce_microstep: 8.93 | backward_allreduce: 3.03 | reduce_tied_grads: 0.39 | comms: 18.48 | reduce_grads: 0.23 | step: 378.90 | _step_clipping: 0.14 | _step_step: 376.85 | _step_zero_grad: 0.63 | _step_check_overflow: 0.61 samples/sec: 16.069 | iteration 24270/ 143000 | elapsed time per iteration (ms): 63725.7 | learning rate: 5.631E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.335185E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 16:36:06,504] [INFO] [logging.py:60:log_dist] [Rank 0] step=24280, skipped=29, lr=[0.0005630791151922233, 0.0005630791151922233], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24280 loss: 2.3577 iter time (s): 63.514 samples/sec: 16.122 %comms: 0.002883591043337388 %optimizer_step 0.0578748261566846 %forward: 22.89337134120395 %backward: 61.436040637017854 [2025-04-11 16:36:06,504] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25935.01 | forward: 145404.94 | backward_microstep: 390217.71 | backward: 390204.83 | backward_inner_microstep: 390186.64 | backward_inner: 390179.79 | backward_allreduce_microstep: 8.65 | backward_allreduce: 2.97 | reduce_tied_grads: 0.37 | comms: 18.31 | reduce_grads: 0.24 | step: 367.59 | _step_clipping: 0.14 | _step_step: 365.55 | _step_zero_grad: 0.55 | _step_check_overflow: 0.65 samples/sec: 16.122 | iteration 24280/ 143000 | elapsed time per iteration (ms): 63514.6 | learning rate: 5.631E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.342093E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 16:46:59,774] [INFO] [logging.py:60:log_dist] [Rank 0] step=24290, skipped=29, lr=[0.0005630474325330154, 0.0005630474325330154], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24290 loss: 2.3275 iter time (s): 65.326 samples/sec: 15.675 %comms: 0.002865233163097758 %optimizer_step 0.057264955036173376 %forward: 22.39109293254585 %backward: 59.74866262430967 [2025-04-11 16:46:59,775] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 43049.20 | forward: 146272.88 | backward_microstep: 390331.93 | backward: 390316.31 | backward_inner_microstep: 390296.97 | backward_inner: 390289.73 | backward_allreduce_microstep: 9.12 | backward_allreduce: 3.11 | reduce_tied_grads: 0.35 | comms: 18.72 | reduce_grads: 0.23 | step: 374.09 | _step_clipping: 0.18 | _step_step: 371.76 | _step_zero_grad: 0.64 | _step_check_overflow: 0.80 samples/sec: 15.675 | iteration 24290/ 143000 | elapsed time per iteration (ms): 65327.0 | learning rate: 5.630E-04 | approx flops per GPU: 67.6TFLOPS | lm_loss: 2.333668E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 16:57:43,679] [INFO] [logging.py:60:log_dist] [Rank 0] step=24300, skipped=29, lr=[0.0005630157371779588, 0.0005630157371779588], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24300 loss: 2.3349 iter time (s): 64.390 samples/sec: 15.903 %comms: 0.0028041541714158603 %optimizer_step 0.05646146576970129 %forward: 22.62499272098125 %backward: 60.59395938388925 [2025-04-11 16:57:43,680] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 34477.88 | forward: 145682.06 | backward_microstep: 390176.11 | backward: 390163.79 | backward_inner_microstep: 390144.78 | backward_inner: 390137.73 | backward_allreduce_microstep: 9.16 | backward_allreduce: 3.16 | reduce_tied_grads: 0.31 | comms: 18.06 | reduce_grads: 0.21 | step: 363.55 | _step_clipping: 0.11 | _step_step: 361.75 | _step_zero_grad: 0.55 | _step_check_overflow: 0.52 samples/sec: 15.903 | iteration 24300/ 143000 | elapsed time per iteration (ms): 64390.5 | learning rate: 5.630E-04 | approx flops per GPU: 68.6TFLOPS | lm_loss: 2.339075E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 17:08:26,917] [INFO] [logging.py:60:log_dist] [Rank 0] step=24310, skipped=29, lr=[0.0005629840291285833, 0.0005629840291285833], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24310 loss: 2.3221 iter time (s): 64.323 samples/sec: 15.920 %comms: 0.002845315544621 %optimizer_step 0.05802921791165158 %forward: 22.652139761447348 %backward: 60.67240815419442 [2025-04-11 17:08:26,917] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 33657.81 | forward: 145705.68 | backward_microstep: 390278.31 | backward: 390264.00 | backward_inner_microstep: 390245.11 | backward_inner: 390237.90 | backward_allreduce_microstep: 9.03 | backward_allreduce: 3.10 | reduce_tied_grads: 0.35 | comms: 18.30 | reduce_grads: 0.23 | step: 373.26 | _step_clipping: 0.16 | _step_step: 371.33 | _step_zero_grad: 0.59 | _step_check_overflow: 0.50 samples/sec: 15.919 | iteration 24310/ 143000 | elapsed time per iteration (ms): 64323.7 | learning rate: 5.630E-04 | approx flops per GPU: 68.7TFLOPS | lm_loss: 2.326268E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 17:19:08,052] [INFO] [logging.py:60:log_dist] [Rank 0] step=24320, skipped=29, lr=[0.0005629523083864193, 0.0005629523083864193], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24320 loss: 2.3172 iter time (s): 64.113 samples/sec: 15.972 %comms: 0.002849324877941584 %optimizer_step 0.05720917485838551 %forward: 22.744783382077756 %backward: 60.87815419986943 [2025-04-11 17:19:08,053] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31351.01 | forward: 145823.58 | backward_microstep: 390329.60 | backward: 390307.97 | backward_inner_microstep: 390288.63 | backward_inner: 390281.47 | backward_allreduce_microstep: 9.24 | backward_allreduce: 3.10 | reduce_tied_grads: 0.34 | comms: 18.27 | reduce_grads: 0.22 | step: 366.79 | _step_clipping: 0.12 | _step_step: 364.98 | _step_zero_grad: 0.50 | _step_check_overflow: 0.58 samples/sec: 15.972 | iteration 24320/ 143000 | elapsed time per iteration (ms): 64113.6 | learning rate: 5.630E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.324684E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 17:29:50,298] [INFO] [logging.py:60:log_dist] [Rank 0] step=24330, skipped=29, lr=[0.0005629205749529977, 0.0005629205749529977], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24330 loss: 2.3204 iter time (s): 64.224 samples/sec: 15.944 %comms: 0.0028098777610737636 %optimizer_step 0.055983949421086615 %forward: 22.667177244173896 %backward: 60.75365256777123 [2025-04-11 17:29:50,298] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32884.96 | forward: 145577.53 | backward_microstep: 390198.13 | backward: 390183.86 | backward_inner_microstep: 390165.08 | backward_inner: 390158.01 | backward_allreduce_microstep: 8.95 | backward_allreduce: 3.09 | reduce_tied_grads: 0.32 | comms: 18.05 | reduce_grads: 0.22 | step: 359.55 | _step_clipping: 0.13 | _step_step: 357.71 | _step_zero_grad: 0.54 | _step_check_overflow: 0.55 samples/sec: 15.944 | iteration 24330/ 143000 | elapsed time per iteration (ms): 64224.5 | learning rate: 5.629E-04 | approx flops per GPU: 68.8TFLOPS | lm_loss: 2.320337E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 17:40:28,132] [INFO] [logging.py:60:log_dist] [Rank 0] step=24340, skipped=29, lr=[0.0005628888288298503, 0.0005628888288298503], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24340 loss: 2.3418 iter time (s): 63.783 samples/sec: 16.054 %comms: 0.002859695713670896 %optimizer_step 0.05534926400972 %forward: 22.804823190212417 %backward: 61.15419683642872 [2025-04-11 17:40:28,133] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28750.83 | forward: 145455.83 | backward_microstep: 390073.48 | backward: 390059.34 | backward_inner_microstep: 390041.06 | backward_inner: 390034.28 | backward_allreduce_microstep: 8.75 | backward_allreduce: 3.02 | reduce_tied_grads: 0.32 | comms: 18.24 | reduce_grads: 0.23 | step: 353.03 | _step_clipping: 0.12 | _step_step: 351.25 | _step_zero_grad: 0.51 | _step_check_overflow: 0.53 samples/sec: 16.054 | iteration 24340/ 143000 | elapsed time per iteration (ms): 63783.5 | learning rate: 5.629E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.342461E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 17:51:13,753] [INFO] [logging.py:60:log_dist] [Rank 0] step=24350, skipped=29, lr=[0.0005628570700185092, 0.0005628570700185092], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24350 loss: 2.3475 iter time (s): 64.561 samples/sec: 15.861 %comms: 0.002962992827800278 %optimizer_step 0.056693775885159205 %forward: 22.615814739036484 %backward: 60.44670426265295 [2025-04-11 17:51:13,754] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 35786.34 | forward: 146011.00 | backward_microstep: 390265.40 | backward: 390252.76 | backward_inner_microstep: 390233.97 | backward_inner: 390226.62 | backward_allreduce_microstep: 9.16 | backward_allreduce: 3.05 | reduce_tied_grads: 0.33 | comms: 19.13 | reduce_grads: 0.23 | step: 366.02 | _step_clipping: 0.13 | _step_step: 364.14 | _step_zero_grad: 0.55 | _step_check_overflow: 0.57 samples/sec: 15.861 | iteration 24350/ 143000 | elapsed time per iteration (ms): 64562.1 | learning rate: 5.629E-04 | approx flops per GPU: 68.4TFLOPS | lm_loss: 2.338428E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 18:01:50,585] [INFO] [logging.py:60:log_dist] [Rank 0] step=24360, skipped=29, lr=[0.000562825298520507, 0.000562825298520507], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24360 loss: 2.3404 iter time (s): 63.683 samples/sec: 16.080 %comms: 0.002831706381690609 %optimizer_step 0.05710327080199883 %forward: 22.839534067836198 %backward: 61.25401938156634 [2025-04-11 18:01:50,586] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27748.32 | forward: 145447.97 | backward_microstep: 390093.53 | backward: 390081.20 | backward_inner_microstep: 390062.42 | backward_inner: 390054.91 | backward_allreduce_microstep: 9.04 | backward_allreduce: 3.10 | reduce_tied_grads: 0.33 | comms: 18.03 | reduce_grads: 0.24 | step: 363.65 | _step_clipping: 0.13 | _step_step: 361.80 | _step_zero_grad: 0.56 | _step_check_overflow: 0.52 samples/sec: 16.080 | iteration 24360/ 143000 | elapsed time per iteration (ms): 63683.2 | learning rate: 5.628E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.342251E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 18:12:25,165] [INFO] [logging.py:60:log_dist] [Rank 0] step=24370, skipped=29, lr=[0.0005627935143373776, 0.0005627935143373776], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24370 loss: 2.3333 iter time (s): 63.457 samples/sec: 16.137 %comms: 0.0028358896259584442 %optimizer_step 0.0545608181759485 %forward: 22.910819180044296 %backward: 61.46199890054799 [2025-04-11 18:12:25,166] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25641.63 | forward: 145386.23 | backward_microstep: 390032.27 | backward: 390022.21 | backward_inner_microstep: 390005.45 | backward_inner: 389998.73 | backward_allreduce_microstep: 8.10 | backward_allreduce: 2.79 | reduce_tied_grads: 0.30 | comms: 18.00 | reduce_grads: 0.21 | step: 346.23 | _step_clipping: 0.12 | _step_step: 344.59 | _step_zero_grad: 0.48 | _step_check_overflow: 0.45 samples/sec: 16.137 | iteration 24370/ 143000 | elapsed time per iteration (ms): 63458.0 | learning rate: 5.628E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.330623E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 18:23:20,924] [INFO] [logging.py:60:log_dist] [Rank 0] step=24380, skipped=29, lr=[0.0005627617174706547, 0.0005627617174706547], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24380 loss: 2.3135 iter time (s): 65.575 samples/sec: 15.616 %comms: 0.002778658711462821 %optimizer_step 0.0562114385888211 %forward: 22.312196356774674 %backward: 59.54377212614936 [2025-04-11 18:23:20,925] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 45360.32 | forward: 146312.91 | backward_microstep: 390476.02 | backward: 390460.12 | backward_inner_microstep: 390440.74 | backward_inner: 390433.27 | backward_allreduce_microstep: 9.10 | backward_allreduce: 3.14 | reduce_tied_grads: 0.33 | comms: 18.22 | reduce_grads: 0.23 | step: 368.61 | _step_clipping: 0.13 | _step_step: 366.54 | _step_zero_grad: 0.76 | _step_check_overflow: 0.52 samples/sec: 15.615 | iteration 24380/ 143000 | elapsed time per iteration (ms): 65575.9 | learning rate: 5.628E-04 | approx flops per GPU: 67.4TFLOPS | lm_loss: 2.321102E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 18:33:54,375] [INFO] [logging.py:60:log_dist] [Rank 0] step=24390, skipped=29, lr=[0.0005627299079218731, 0.0005627299079218731], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24390 loss: 2.3204 iter time (s): 63.344 samples/sec: 16.166 %comms: 0.0028562672550754113 %optimizer_step 0.05942042342075571 %forward: 22.97848932735869 %backward: 61.551379645680235 [2025-04-11 18:33:54,376] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24441.47 | forward: 145556.00 | backward_microstep: 389903.01 | backward: 389893.89 | backward_inner_microstep: 389874.84 | backward_inner: 389868.04 | backward_allreduce_microstep: 9.47 | backward_allreduce: 3.52 | reduce_tied_grads: 0.34 | comms: 18.09 | reduce_grads: 0.24 | step: 376.40 | _step_clipping: 0.15 | _step_step: 374.38 | _step_zero_grad: 0.59 | _step_check_overflow: 0.64 samples/sec: 16.165 | iteration 24390/ 143000 | elapsed time per iteration (ms): 63345.1 | learning rate: 5.627E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.333733E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 18:44:31,972] [INFO] [logging.py:60:log_dist] [Rank 0] step=24400, skipped=29, lr=[0.0005626980856925679, 0.0005626980856925679], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24400 loss: 2.3364 iter time (s): 63.759 samples/sec: 16.060 %comms: 0.002843002671705675 %optimizer_step 0.05687923638233833 %forward: 22.799042780755506 %backward: 61.16413549104496 [2025-04-11 18:44:31,973] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28709.93 | forward: 145364.62 | backward_microstep: 389986.24 | backward: 389976.96 | backward_inner_microstep: 389958.32 | backward_inner: 389951.53 | backward_allreduce_microstep: 9.14 | backward_allreduce: 3.17 | reduce_tied_grads: 0.31 | comms: 18.13 | reduce_grads: 0.23 | step: 362.66 | _step_clipping: 0.12 | _step_step: 360.60 | _step_zero_grad: 0.57 | _step_check_overflow: 0.75 samples/sec: 16.060 | iteration 24400/ 143000 | elapsed time per iteration (ms): 63759.8 | learning rate: 5.627E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.339748E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 18:55:39,175] [INFO] [logging.py:60:log_dist] [Rank 0] step=24410, skipped=29, lr=[0.0005626662507842752, 0.0005626662507842752], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24410 loss: 2.3515 iter time (s): 66.719 samples/sec: 15.348 %comms: 0.0027306137396751868 %optimizer_step 0.05743483332277487 %forward: 21.92782585730321 %backward: 58.505461858907424 [2025-04-11 18:55:39,175] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 56913.12 | forward: 146301.35 | backward_microstep: 390362.45 | backward: 390345.49 | backward_inner_microstep: 390325.27 | backward_inner: 390317.75 | backward_allreduce_microstep: 9.56 | backward_allreduce: 3.26 | reduce_tied_grads: 0.36 | comms: 18.22 | reduce_grads: 0.24 | step: 383.20 | _step_clipping: 0.15 | _step_step: 381.04 | _step_zero_grad: 0.69 | _step_check_overflow: 0.61 samples/sec: 15.348 | iteration 24410/ 143000 | elapsed time per iteration (ms): 66720.2 | learning rate: 5.627E-04 | approx flops per GPU: 66.2TFLOPS | lm_loss: 2.330477E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 19:06:11,399] [INFO] [logging.py:60:log_dist] [Rank 0] step=24420, skipped=29, lr=[0.0005626344031985314, 0.0005626344031985314], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24420 loss: 2.3192 iter time (s): 63.222 samples/sec: 16.197 %comms: 0.0028959359344996553 %optimizer_step 0.05614394349271445 %forward: 22.98555898962518 %backward: 61.69854976783865 [2025-04-11 19:06:11,400] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23283.82 | forward: 145318.93 | backward_microstep: 390079.07 | backward: 390069.58 | backward_inner_microstep: 390052.71 | backward_inner: 390046.40 | backward_allreduce_microstep: 8.24 | backward_allreduce: 2.80 | reduce_tied_grads: 0.27 | comms: 18.31 | reduce_grads: 0.18 | step: 354.95 | _step_clipping: 0.12 | _step_step: 353.15 | _step_zero_grad: 0.48 | _step_check_overflow: 0.62 samples/sec: 16.197 | iteration 24420/ 143000 | elapsed time per iteration (ms): 63222.4 | learning rate: 5.626E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.336536E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 19:16:49,018] [INFO] [logging.py:60:log_dist] [Rank 0] step=24430, skipped=29, lr=[0.0005626025429368737, 0.0005626025429368737], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24430 loss: 2.3365 iter time (s): 63.761 samples/sec: 16.060 %comms: 0.0028690388364969724 %optimizer_step 0.055723772937314495 %forward: 22.874176124096017 %backward: 61.19348898573493 [2025-04-11 19:16:49,019] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28019.67 | forward: 145848.85 | backward_microstep: 390190.88 | backward: 390177.99 | backward_inner_microstep: 390158.95 | backward_inner: 390152.45 | backward_allreduce_microstep: 8.22 | backward_allreduce: 2.87 | reduce_tied_grads: 0.30 | comms: 18.29 | reduce_grads: 0.23 | step: 355.30 | _step_clipping: 0.12 | _step_step: 353.45 | _step_zero_grad: 0.49 | _step_check_overflow: 0.63 samples/sec: 16.060 | iteration 24430/ 143000 | elapsed time per iteration (ms): 63761.9 | learning rate: 5.626E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.332448E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 19:27:30,765] [INFO] [logging.py:60:log_dist] [Rank 0] step=24440, skipped=29, lr=[0.0005625706700008396, 0.0005625706700008396], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24440 loss: 2.3198 iter time (s): 64.174 samples/sec: 15.957 %comms: 0.0028421548795136723 %optimizer_step 0.05721784845980004 %forward: 22.685544629830034 %backward: 60.771172972175805 [2025-04-11 19:27:30,766] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32625.18 | forward: 145582.33 | backward_microstep: 390002.22 | backward: 389993.25 | backward_inner_microstep: 389973.28 | backward_inner: 389966.41 | backward_allreduce_microstep: 8.81 | backward_allreduce: 3.05 | reduce_tied_grads: 0.35 | comms: 18.24 | reduce_grads: 0.23 | step: 367.19 | _step_clipping: 0.14 | _step_step: 365.20 | _step_zero_grad: 0.60 | _step_check_overflow: 0.60 samples/sec: 15.956 | iteration 24440/ 143000 | elapsed time per iteration (ms): 64174.7 | learning rate: 5.626E-04 | approx flops per GPU: 68.8TFLOPS | lm_loss: 2.327040E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 19:38:11,431] [INFO] [logging.py:60:log_dist] [Rank 0] step=24450, skipped=29, lr=[0.0005625387843919676, 0.0005625387843919676], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24450 loss: 2.3557 iter time (s): 64.066 samples/sec: 15.984 %comms: 0.002840362778246481 %optimizer_step 0.0581475001157363 %forward: 22.71274079501173 %backward: 60.88139041186853 [2025-04-11 19:38:11,432] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31549.83 | forward: 145511.38 | backward_microstep: 390052.95 | backward: 390042.53 | backward_inner_microstep: 390023.23 | backward_inner: 390016.25 | backward_allreduce_microstep: 9.55 | backward_allreduce: 3.37 | reduce_tied_grads: 0.37 | comms: 18.20 | reduce_grads: 0.25 | step: 372.53 | _step_clipping: 0.14 | _step_step: 370.49 | _step_zero_grad: 0.61 | _step_check_overflow: 0.63 samples/sec: 15.983 | iteration 24450/ 143000 | elapsed time per iteration (ms): 64066.6 | learning rate: 5.625E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.347383E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 19:48:43,892] [INFO] [logging.py:60:log_dist] [Rank 0] step=24460, skipped=29, lr=[0.0005625068861117966, 0.0005625068861117966], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24460 loss: 2.3340 iter time (s): 63.246 samples/sec: 16.191 %comms: 0.0028359287767018466 %optimizer_step 0.05503897316465876 %forward: 22.98305787295299 %backward: 61.667626885533664 [2025-04-11 19:48:43,893] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23569.35 | forward: 145357.64 | backward_microstep: 390030.67 | backward: 390020.38 | backward_inner_microstep: 390002.59 | backward_inner: 389996.08 | backward_allreduce_microstep: 8.53 | backward_allreduce: 2.92 | reduce_tied_grads: 0.32 | comms: 17.94 | reduce_grads: 0.20 | step: 348.10 | _step_clipping: 0.13 | _step_step: 346.42 | _step_zero_grad: 0.47 | _step_check_overflow: 0.50 samples/sec: 16.191 | iteration 24460/ 143000 | elapsed time per iteration (ms): 63246.1 | learning rate: 5.625E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.334709E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 19:59:31,191] [INFO] [logging.py:60:log_dist] [Rank 0] step=24470, skipped=29, lr=[0.0005624749751618662, 0.0005624749751618662], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24470 loss: 2.3278 iter time (s): 64.729 samples/sec: 15.820 %comms: 0.002835755557504084 %optimizer_step 0.06092169000160997 %forward: 22.548596855229043 %backward: 60.291835133508634 [2025-04-11 19:59:31,192] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 37456.84 | forward: 145955.18 | backward_microstep: 390280.66 | backward: 390263.99 | backward_inner_microstep: 390241.62 | backward_inner: 390234.16 | backward_allreduce_microstep: 10.32 | backward_allreduce: 3.64 | reduce_tied_grads: 0.37 | comms: 18.36 | reduce_grads: 0.25 | step: 394.34 | _step_clipping: 0.15 | _step_step: 390.25 | _step_zero_grad: 0.69 | _step_check_overflow: 0.67 samples/sec: 15.820 | iteration 24470/ 143000 | elapsed time per iteration (ms): 64729.8 | learning rate: 5.625E-04 | approx flops per GPU: 68.2TFLOPS | lm_loss: 2.329399E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 20:10:12,950] [INFO] [logging.py:60:log_dist] [Rank 0] step=24480, skipped=29, lr=[0.0005624430515437165, 0.0005624430515437165], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24480 loss: 2.3233 iter time (s): 64.175 samples/sec: 15.956 %comms: 0.0028725414167578077 %optimizer_step 0.057303709038591735 %forward: 22.715096009954486 %backward: 60.807455617792705 [2025-04-11 20:10:12,951] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32168.70 | forward: 145774.05 | backward_microstep: 390244.38 | backward: 390231.63 | backward_inner_microstep: 390212.03 | backward_inner: 390204.91 | backward_allreduce_microstep: 9.50 | backward_allreduce: 3.25 | reduce_tied_grads: 0.37 | comms: 18.43 | reduce_grads: 0.25 | step: 367.75 | _step_clipping: 0.14 | _step_step: 365.59 | _step_zero_grad: 0.60 | _step_check_overflow: 0.73 samples/sec: 15.956 | iteration 24480/ 143000 | elapsed time per iteration (ms): 64176.0 | learning rate: 5.624E-04 | approx flops per GPU: 68.8TFLOPS | lm_loss: 2.321721E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 20:21:02,904] [INFO] [logging.py:60:log_dist] [Rank 0] step=24490, skipped=29, lr=[0.0005624111152588882, 0.0005624111152588882], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24490 loss: 2.3279 iter time (s): 64.995 samples/sec: 15.755 %comms: 0.0027804782509749863 %optimizer_step 0.05390876982560998 %forward: 22.40071797246895 %backward: 60.01555422232874 [2025-04-11 20:21:02,905] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 40742.61 | forward: 145592.93 | backward_microstep: 390080.74 | backward: 390069.65 | backward_inner_microstep: 390046.42 | backward_inner: 390039.78 | backward_allreduce_microstep: 13.89 | backward_allreduce: 2.90 | reduce_tied_grads: 0.30 | comms: 18.07 | reduce_grads: 0.19 | step: 350.38 | _step_clipping: 0.11 | _step_step: 348.76 | _step_zero_grad: 0.47 | _step_check_overflow: 0.48 samples/sec: 15.755 | iteration 24490/ 143000 | elapsed time per iteration (ms): 64995.3 | learning rate: 5.624E-04 | approx flops per GPU: 68.0TFLOPS | lm_loss: 2.333572E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 20:31:37,376] [INFO] [logging.py:60:log_dist] [Rank 0] step=24500, skipped=29, lr=[0.000562379166308923, 0.000562379166308923], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24500 loss: 2.3383 iter time (s): 63.447 samples/sec: 16.140 %comms: 0.0028281434407282082 %optimizer_step 0.05773597283833017 %forward: 22.950606707602205 %backward: 61.460494954097165 [2025-04-11 20:31:37,376] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25387.13 | forward: 145613.89 | backward_microstep: 389955.47 | backward: 389946.21 | backward_inner_microstep: 389928.45 | backward_inner: 389922.00 | backward_allreduce_microstep: 8.59 | backward_allreduce: 2.88 | reduce_tied_grads: 0.27 | comms: 17.94 | reduce_grads: 0.19 | step: 366.32 | _step_clipping: 0.12 | _step_step: 364.57 | _step_zero_grad: 0.49 | _step_check_overflow: 0.57 samples/sec: 16.139 | iteration 24500/ 143000 | elapsed time per iteration (ms): 63447.2 | learning rate: 5.624E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.331859E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 20:42:14,015] [INFO] [logging.py:60:log_dist] [Rank 0] step=24510, skipped=29, lr=[0.0005623472046953626, 0.0005623472046953626], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24510 loss: 2.3167 iter time (s): 63.663 samples/sec: 16.085 %comms: 0.0028603489189839154 %optimizer_step 0.0579064302736774 %forward: 22.86896147609176 %backward: 61.32480162907451 [2025-04-11 20:42:14,015] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27037.52 | forward: 145591.43 | backward_microstep: 390429.99 | backward: 390414.12 | backward_inner_microstep: 390395.66 | backward_inner: 390388.26 | backward_allreduce_microstep: 8.78 | backward_allreduce: 3.06 | reduce_tied_grads: 0.35 | comms: 18.21 | reduce_grads: 0.23 | step: 368.65 | _step_clipping: 0.13 | _step_step: 366.76 | _step_zero_grad: 0.50 | _step_check_overflow: 0.63 samples/sec: 16.084 | iteration 24510/ 143000 | elapsed time per iteration (ms): 63663.9 | learning rate: 5.623E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.328204E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 20:52:58,464] [INFO] [logging.py:60:log_dist] [Rank 0] step=24520, skipped=29, lr=[0.0005623152304197497, 0.0005623152304197497], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24520 loss: 2.3267 iter time (s): 64.444 samples/sec: 15.890 %comms: 0.002802304070492649 %optimizer_step 0.055626213048539895 %forward: 22.58730284934313 %backward: 60.52254292377355 [2025-04-11 20:52:58,464] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 35298.07 | forward: 145562.30 | backward_microstep: 390043.09 | backward: 390033.32 | backward_inner_microstep: 390014.82 | backward_inner: 390007.97 | backward_allreduce_microstep: 8.97 | backward_allreduce: 3.07 | reduce_tied_grads: 0.32 | comms: 18.06 | reduce_grads: 0.21 | step: 358.48 | _step_clipping: 0.12 | _step_step: 356.51 | _step_zero_grad: 0.53 | _step_check_overflow: 0.69 samples/sec: 15.890 | iteration 24520/ 143000 | elapsed time per iteration (ms): 64444.9 | learning rate: 5.623E-04 | approx flops per GPU: 68.5TFLOPS | lm_loss: 2.326373E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 21:03:39,201] [INFO] [logging.py:60:log_dist] [Rank 0] step=24530, skipped=29, lr=[0.0005622832434836274, 0.0005622832434836274], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24530 loss: 2.3329 iter time (s): 64.073 samples/sec: 15.982 %comms: 0.0028606621553086526 %optimizer_step 0.05783804315624963 %forward: 22.742329848293714 %backward: 60.912269567040866 [2025-04-11 21:03:39,202] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31127.47 | forward: 145717.11 | backward_microstep: 390297.83 | backward: 390283.67 | backward_inner_microstep: 390264.11 | backward_inner: 390256.81 | backward_allreduce_microstep: 9.46 | backward_allreduce: 3.24 | reduce_tied_grads: 0.36 | comms: 18.33 | reduce_grads: 0.24 | step: 370.59 | _step_clipping: 0.14 | _step_step: 368.48 | _step_zero_grad: 0.62 | _step_check_overflow: 0.66 samples/sec: 15.982 | iteration 24530/ 143000 | elapsed time per iteration (ms): 64073.7 | learning rate: 5.623E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.333194E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 21:14:29,200] [INFO] [logging.py:60:log_dist] [Rank 0] step=24540, skipped=29, lr=[0.0005622512438885397, 0.0005622512438885397], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24540 loss: 2.3463 iter time (s): 64.999 samples/sec: 15.754 %comms: 0.0028305727508272276 %optimizer_step 0.059032420205935354 %forward: 22.460260217899318 %backward: 60.05694327858991 [2025-04-11 21:14:29,201] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 40053.15 | forward: 145990.11 | backward_microstep: 390380.08 | backward: 390365.90 | backward_inner_microstep: 390346.33 | backward_inner: 390339.01 | backward_allreduce_microstep: 9.40 | backward_allreduce: 3.30 | reduce_tied_grads: 0.39 | comms: 18.40 | reduce_grads: 0.24 | step: 383.71 | _step_clipping: 0.13 | _step_step: 381.61 | _step_zero_grad: 0.57 | _step_check_overflow: 0.73 samples/sec: 15.754 | iteration 24540/ 143000 | elapsed time per iteration (ms): 64999.9 | learning rate: 5.623E-04 | approx flops per GPU: 68.0TFLOPS | lm_loss: 2.333081E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 21:25:10,367] [INFO] [logging.py:60:log_dist] [Rank 0] step=24550, skipped=29, lr=[0.0005622192316360311, 0.0005622192316360311], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24550 loss: 2.3504 iter time (s): 64.116 samples/sec: 15.971 %comms: 0.002811256082236164 %optimizer_step 0.05702022061894814 %forward: 22.747420507627144 %backward: 60.84158353474982 [2025-04-11 21:25:10,368] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31675.17 | forward: 145847.63 | backward_microstep: 390104.31 | backward: 390092.62 | backward_inner_microstep: 390074.77 | backward_inner: 390067.80 | backward_allreduce_microstep: 8.54 | backward_allreduce: 2.93 | reduce_tied_grads: 0.31 | comms: 18.02 | reduce_grads: 0.21 | step: 365.59 | _step_clipping: 0.14 | _step_step: 363.75 | _step_zero_grad: 0.54 | _step_check_overflow: 0.56 samples/sec: 15.971 | iteration 24550/ 143000 | elapsed time per iteration (ms): 64116.7 | learning rate: 5.622E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.335778E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 21:35:51,437] [INFO] [logging.py:60:log_dist] [Rank 0] step=24560, skipped=29, lr=[0.0005621872067276465, 0.0005621872067276465], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24560 loss: 2.3314 iter time (s): 64.106 samples/sec: 15.973 %comms: 0.002855645279381706 %optimizer_step 0.05727595142987537 %forward: 22.693348369472478 %backward: 60.86956800187381 [2025-04-11 21:35:51,437] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31813.40 | forward: 145478.73 | backward_microstep: 390225.69 | backward: 390212.46 | backward_inner_microstep: 390193.52 | backward_inner: 390186.30 | backward_allreduce_microstep: 8.99 | backward_allreduce: 3.08 | reduce_tied_grads: 0.40 | comms: 18.31 | reduce_grads: 0.26 | step: 367.18 | _step_clipping: 0.16 | _step_step: 365.22 | _step_zero_grad: 0.56 | _step_check_overflow: 0.54 samples/sec: 15.973 | iteration 24560/ 143000 | elapsed time per iteration (ms): 64106.9 | learning rate: 5.622E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.324614E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 21:46:22,832] [INFO] [logging.py:60:log_dist] [Rank 0] step=24570, skipped=29, lr=[0.0005621551691649316, 0.0005621551691649316], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24570 loss: 2.3163 iter time (s): 63.139 samples/sec: 16.218 %comms: 0.0028541214451082222 %optimizer_step 0.05528516011113234 %forward: 23.037083347617816 %backward: 61.78986899482501 [2025-04-11 21:46:22,832] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22283.21 | forward: 145453.78 | backward_microstep: 390147.24 | backward: 390134.89 | backward_inner_microstep: 390117.67 | backward_inner: 390110.80 | backward_allreduce_microstep: 8.28 | backward_allreduce: 2.79 | reduce_tied_grads: 0.29 | comms: 18.02 | reduce_grads: 0.19 | step: 349.06 | _step_clipping: 0.12 | _step_step: 347.39 | _step_zero_grad: 0.50 | _step_check_overflow: 0.49 samples/sec: 16.218 | iteration 24570/ 143000 | elapsed time per iteration (ms): 63139.5 | learning rate: 5.622E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.324118E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 21:56:53,625] [INFO] [logging.py:60:log_dist] [Rank 0] step=24580, skipped=29, lr=[0.0005621231189494328, 0.0005621231189494328], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24580 loss: 2.3254 iter time (s): 63.079 samples/sec: 16.234 %comms: 0.0028973239785820798 %optimizer_step 0.055465488821473605 %forward: 23.05465073055199 %backward: 61.841831593462345 [2025-04-11 21:56:53,626] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21743.49 | forward: 145426.00 | backward_microstep: 390102.07 | backward: 390090.94 | backward_inner_microstep: 390071.20 | backward_inner: 390064.65 | backward_allreduce_microstep: 8.32 | backward_allreduce: 2.87 | reduce_tied_grads: 0.29 | comms: 18.28 | reduce_grads: 0.19 | step: 349.87 | _step_clipping: 0.11 | _step_step: 348.02 | _step_zero_grad: 0.50 | _step_check_overflow: 0.65 samples/sec: 16.234 | iteration 24580/ 143000 | elapsed time per iteration (ms): 63079.4 | learning rate: 5.621E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.323365E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 22:07:27,620] [INFO] [logging.py:60:log_dist] [Rank 0] step=24590, skipped=29, lr=[0.0005620910560826968, 0.0005620910560826968], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24590 loss: 2.3231 iter time (s): 63.399 samples/sec: 16.152 %comms: 0.002820646368009775 %optimizer_step 0.05454883003580544 %forward: 22.949265097144906 %backward: 61.51204854204351 [2025-04-11 22:07:27,620] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24999.55 | forward: 145495.80 | backward_microstep: 389988.70 | backward: 389979.57 | backward_inner_microstep: 389962.68 | backward_inner: 389956.18 | backward_allreduce_microstep: 8.12 | backward_allreduce: 2.78 | reduce_tied_grads: 0.26 | comms: 17.88 | reduce_grads: 0.19 | step: 345.83 | _step_clipping: 0.12 | _step_step: 344.18 | _step_zero_grad: 0.46 | _step_check_overflow: 0.53 samples/sec: 16.152 | iteration 24590/ 143000 | elapsed time per iteration (ms): 63399.4 | learning rate: 5.621E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.331114E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 22:17:53,013] [INFO] [logging.py:60:log_dist] [Rank 0] step=24600, skipped=29, lr=[0.0005620589805662711, 0.0005620589805662711], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24600 loss: 2.3184 iter time (s): 62.539 samples/sec: 16.374 %comms: 0.003537769893470775 %optimizer_step 0.05761923790698932 %forward: 23.212142257291948 %backward: 62.35810473512225 [2025-04-11 22:17:53,014] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16722.37 | forward: 145165.81 | backward_microstep: 389989.88 | backward: 389979.73 | backward_inner_microstep: 389963.38 | backward_inner: 389957.13 | backward_allreduce_microstep: 7.89 | backward_allreduce: 2.79 | reduce_tied_grads: 0.32 | comms: 22.12 | reduce_grads: 0.20 | step: 360.34 | _step_clipping: 0.12 | _step_step: 358.49 | _step_zero_grad: 0.49 | _step_check_overflow: 0.64 samples/sec: 16.374 | iteration 24600/ 143000 | elapsed time per iteration (ms): 62539.3 | learning rate: 5.621E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.334888E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 22:28:46,312] [INFO] [logging.py:60:log_dist] [Rank 0] step=24610, skipped=29, lr=[0.000562026892401704, 0.000562026892401704], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24610 loss: 2.3109 iter time (s): 65.329 samples/sec: 15.674 %comms: 0.0028431366816812735 %optimizer_step 0.057304212671406084 %forward: 22.3504141862577 %backward: 59.74039921673159 [2025-04-11 22:28:46,313] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 43394.16 | forward: 146013.59 | backward_microstep: 390296.04 | backward: 390279.56 | backward_inner_microstep: 390260.65 | backward_inner: 390253.43 | backward_allreduce_microstep: 9.00 | backward_allreduce: 3.08 | reduce_tied_grads: 0.39 | comms: 18.57 | reduce_grads: 0.24 | step: 374.36 | _step_clipping: 0.15 | _step_step: 372.12 | _step_zero_grad: 0.66 | _step_check_overflow: 0.75 samples/sec: 15.674 | iteration 24610/ 143000 | elapsed time per iteration (ms): 65330.0 | learning rate: 5.620E-04 | approx flops per GPU: 67.6TFLOPS | lm_loss: 2.329750E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 22:39:34,612] [INFO] [logging.py:60:log_dist] [Rank 0] step=24620, skipped=29, lr=[0.0005619947915905441, 0.0005619947915905441], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24620 loss: 2.3152 iter time (s): 64.829 samples/sec: 15.795 %comms: 0.0028580719951994834 %optimizer_step 0.056564817604843796 %forward: 22.48691312682844 %backward: 60.190595540854574 [2025-04-11 22:39:34,613] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 38746.75 | forward: 145781.23 | backward_microstep: 390224.51 | backward: 390211.82 | backward_inner_microstep: 390189.83 | backward_inner: 390182.82 | backward_allreduce_microstep: 10.67 | backward_allreduce: 3.07 | reduce_tied_grads: 0.33 | comms: 18.53 | reduce_grads: 0.22 | step: 366.71 | _step_clipping: 0.12 | _step_step: 364.59 | _step_zero_grad: 0.62 | _step_check_overflow: 0.71 samples/sec: 15.795 | iteration 24620/ 143000 | elapsed time per iteration (ms): 64830.0 | learning rate: 5.620E-04 | approx flops per GPU: 68.1TFLOPS | lm_loss: 2.325708E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 22:50:11,603] [INFO] [logging.py:60:log_dist] [Rank 0] step=24630, skipped=29, lr=[0.0005619626781343407, 0.0005619626781343407], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24630 loss: 2.3451 iter time (s): 63.698 samples/sec: 16.076 %comms: 0.0028707510327827023 %optimizer_step 0.05709282940046886 %forward: 22.85851477184716 %backward: 61.29169096499204 [2025-04-11 22:50:11,604] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27363.79 | forward: 145605.11 | backward_microstep: 390433.61 | backward: 390418.34 | backward_inner_microstep: 390399.68 | backward_inner: 390392.34 | backward_allreduce_microstep: 8.83 | backward_allreduce: 3.04 | reduce_tied_grads: 0.36 | comms: 18.29 | reduce_grads: 0.24 | step: 363.67 | _step_clipping: 0.15 | _step_step: 361.77 | _step_zero_grad: 0.55 | _step_check_overflow: 0.53 samples/sec: 16.076 | iteration 24630/ 143000 | elapsed time per iteration (ms): 63699.0 | learning rate: 5.620E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.333488E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 23:00:46,529] [INFO] [logging.py:60:log_dist] [Rank 0] step=24640, skipped=29, lr=[0.0005619305520346438, 0.0005619305520346438], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24640 loss: 2.3578 iter time (s): 63.492 samples/sec: 16.128 %comms: 0.002890672694879059 %optimizer_step 0.05677464421414218 %forward: 22.937666576422203 %backward: 61.512159920731314 [2025-04-11 23:00:46,530] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25147.13 | forward: 145635.86 | backward_microstep: 390570.88 | backward: 390553.07 | backward_inner_microstep: 390534.15 | backward_inner: 390526.66 | backward_allreduce_microstep: 8.91 | backward_allreduce: 3.05 | reduce_tied_grads: 0.33 | comms: 18.35 | reduce_grads: 0.21 | step: 360.47 | _step_clipping: 0.14 | _step_step: 358.53 | _step_zero_grad: 0.55 | _step_check_overflow: 0.61 samples/sec: 16.128 | iteration 24640/ 143000 | elapsed time per iteration (ms): 63492.6 | learning rate: 5.619E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.339604E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 23:11:32,804] [INFO] [logging.py:60:log_dist] [Rank 0] step=24650, skipped=29, lr=[0.000561898413293004, 0.000561898413293004], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24650 loss: 2.3288 iter time (s): 64.627 samples/sec: 15.845 %comms: 0.0028242692426528495 %optimizer_step 0.05701595907647296 %forward: 22.536578743191185 %backward: 60.36803873854403 [2025-04-11 23:11:32,805] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 36943.58 | forward: 145646.89 | backward_microstep: 390152.51 | backward: 390139.84 | backward_inner_microstep: 390120.36 | backward_inner: 390113.24 | backward_allreduce_microstep: 9.53 | backward_allreduce: 3.26 | reduce_tied_grads: 0.38 | comms: 18.25 | reduce_grads: 0.27 | step: 368.48 | _step_clipping: 0.16 | _step_step: 366.32 | _step_zero_grad: 0.63 | _step_check_overflow: 0.67 samples/sec: 15.845 | iteration 24650/ 143000 | elapsed time per iteration (ms): 64627.5 | learning rate: 5.619E-04 | approx flops per GPU: 68.4TFLOPS | lm_loss: 2.331656E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 23:22:19,265] [INFO] [logging.py:60:log_dist] [Rank 0] step=24660, skipped=29, lr=[0.0005618662619109724, 0.0005618662619109724], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24660 loss: 2.3231 iter time (s): 64.645 samples/sec: 15.840 %comms: 0.0028035048310600837 %optimizer_step 0.05653940661232499 %forward: 22.542670702591458 %backward: 60.34512873817736 [2025-04-11 23:22:19,266] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 37102.78 | forward: 145728.15 | backward_microstep: 390115.24 | backward: 390103.91 | backward_inner_microstep: 390085.16 | backward_inner: 390077.94 | backward_allreduce_microstep: 8.94 | backward_allreduce: 3.10 | reduce_tied_grads: 0.35 | comms: 18.12 | reduce_grads: 0.23 | step: 365.50 | _step_clipping: 0.14 | _step_step: 363.64 | _step_zero_grad: 0.55 | _step_check_overflow: 0.55 samples/sec: 15.840 | iteration 24660/ 143000 | elapsed time per iteration (ms): 64646.1 | learning rate: 5.619E-04 | approx flops per GPU: 68.3TFLOPS | lm_loss: 2.321981E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 23:32:55,958] [INFO] [logging.py:60:log_dist] [Rank 0] step=24670, skipped=29, lr=[0.0005618340978901005, 0.0005618340978901005], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24670 loss: 2.3333 iter time (s): 63.669 samples/sec: 16.083 %comms: 0.0028956821539295337 %optimizer_step 0.05662378983277929 %forward: 22.859715428424067 %backward: 61.29007916953334 [2025-04-11 23:32:55,959] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27360.56 | forward: 145544.84 | backward_microstep: 390239.41 | backward: 390225.97 | backward_inner_microstep: 390206.98 | backward_inner: 390199.62 | backward_allreduce_microstep: 9.10 | backward_allreduce: 3.13 | reduce_tied_grads: 0.38 | comms: 18.44 | reduce_grads: 0.24 | step: 360.52 | _step_clipping: 0.14 | _step_step: 358.61 | _step_zero_grad: 0.61 | _step_check_overflow: 0.49 samples/sec: 16.083 | iteration 24670/ 143000 | elapsed time per iteration (ms): 63669.3 | learning rate: 5.618E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.325888E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 23:43:43,270] [INFO] [logging.py:60:log_dist] [Rank 0] step=24680, skipped=29, lr=[0.0005618019212319412, 0.0005618019212319412], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24680 loss: 2.3298 iter time (s): 64.731 samples/sec: 15.819 %comms: 0.002818861274674969 %optimizer_step 0.057162935375788704 %forward: 22.517558374614417 %backward: 60.28047962869339 [2025-04-11 23:43:43,271] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 37789.20 | forward: 145757.45 | backward_microstep: 390212.68 | backward: 390199.00 | backward_inner_microstep: 390179.54 | backward_inner: 390172.32 | backward_allreduce_microstep: 9.43 | backward_allreduce: 3.28 | reduce_tied_grads: 0.31 | comms: 18.25 | reduce_grads: 0.21 | step: 370.02 | _step_clipping: 0.12 | _step_step: 368.12 | _step_zero_grad: 0.57 | _step_check_overflow: 0.59 samples/sec: 15.819 | iteration 24680/ 143000 | elapsed time per iteration (ms): 64731.2 | learning rate: 5.618E-04 | approx flops per GPU: 68.2TFLOPS | lm_loss: 2.350584E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-11 23:54:28,431] [INFO] [logging.py:60:log_dist] [Rank 0] step=24690, skipped=29, lr=[0.0005617697319380471, 0.0005617697319380471], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24690 loss: 2.3199 iter time (s): 64.515 samples/sec: 15.872 %comms: 0.002896552302678703 %optimizer_step 0.05687235212186776 %forward: 22.61898191877669 %backward: 60.49603853604522 [2025-04-11 23:54:28,432] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 35377.10 | forward: 145927.46 | backward_microstep: 390306.43 | backward: 390293.14 | backward_inner_microstep: 390274.06 | backward_inner: 390266.73 | backward_allreduce_microstep: 9.14 | backward_allreduce: 3.17 | reduce_tied_grads: 0.38 | comms: 18.69 | reduce_grads: 0.25 | step: 366.91 | _step_clipping: 0.17 | _step_step: 364.92 | _step_zero_grad: 0.58 | _step_check_overflow: 0.57 samples/sec: 15.872 | iteration 24690/ 143000 | elapsed time per iteration (ms): 64516.1 | learning rate: 5.618E-04 | approx flops per GPU: 68.5TFLOPS | lm_loss: 2.338722E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 00:05:16,592] [INFO] [logging.py:60:log_dist] [Rank 0] step=24700, skipped=29, lr=[0.000561737530009972, 0.000561737530009972], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24700 loss: 2.3244 iter time (s): 64.815 samples/sec: 15.799 %comms: 0.002791151324694805 %optimizer_step 0.05651431271070167 %forward: 22.474155399619406 %backward: 60.19156572865606 [2025-04-12 00:05:16,592] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 38834.68 | forward: 145667.17 | backward_microstep: 390146.13 | backward: 390134.14 | backward_inner_microstep: 390115.08 | backward_inner: 390107.93 | backward_allreduce_microstep: 9.08 | backward_allreduce: 3.12 | reduce_tied_grads: 0.35 | comms: 18.09 | reduce_grads: 0.24 | step: 366.30 | _step_clipping: 0.14 | _step_step: 364.29 | _step_zero_grad: 0.63 | _step_check_overflow: 0.54 samples/sec: 15.799 | iteration 24700/ 143000 | elapsed time per iteration (ms): 64816.1 | learning rate: 5.617E-04 | approx flops per GPU: 68.2TFLOPS | lm_loss: 2.339447E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 00:15:56,749] [INFO] [logging.py:60:log_dist] [Rank 0] step=24710, skipped=29, lr=[0.0005617053154492699, 0.0005617053154492699], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24710 loss: 2.3363 iter time (s): 64.015 samples/sec: 15.996 %comms: 0.0028123028842578753 %optimizer_step 0.05577786195534681 %forward: 22.732131663511506 %backward: 60.93387884520739 [2025-04-12 00:15:56,750] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31075.74 | forward: 145519.98 | backward_microstep: 390079.27 | backward: 390068.88 | backward_inner_microstep: 390049.75 | backward_inner: 390042.70 | backward_allreduce_microstep: 9.37 | backward_allreduce: 3.20 | reduce_tied_grads: 0.32 | comms: 18.00 | reduce_grads: 0.22 | step: 357.06 | _step_clipping: 0.15 | _step_step: 355.02 | _step_zero_grad: 0.54 | _step_check_overflow: 0.70 samples/sec: 15.996 | iteration 24710/ 143000 | elapsed time per iteration (ms): 64015.7 | learning rate: 5.617E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.347254E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 00:26:42,854] [INFO] [logging.py:60:log_dist] [Rank 0] step=24720, skipped=29, lr=[0.0005616730882574959, 0.0005616730882574959], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24720 loss: 2.3297 iter time (s): 64.610 samples/sec: 15.849 %comms: 0.0029370108940583307 %optimizer_step 0.05854298046451471 %forward: 22.56477476583356 %backward: 60.373896693836905 [2025-04-12 00:26:42,855] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 36686.74 | forward: 145790.59 | backward_microstep: 390085.67 | backward: 390074.63 | backward_inner_microstep: 390055.86 | backward_inner: 390049.04 | backward_allreduce_microstep: 9.22 | backward_allreduce: 3.30 | reduce_tied_grads: 0.36 | comms: 18.98 | reduce_grads: 0.23 | step: 378.25 | _step_clipping: 0.16 | _step_step: 376.30 | _step_zero_grad: 0.58 | _step_check_overflow: 0.56 samples/sec: 15.849 | iteration 24720/ 143000 | elapsed time per iteration (ms): 64610.5 | learning rate: 5.617E-04 | approx flops per GPU: 68.4TFLOPS | lm_loss: 2.339532E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 00:37:13,102] [INFO] [logging.py:60:log_dist] [Rank 0] step=24730, skipped=29, lr=[0.0005616408484362052, 0.0005616408484362052], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24730 loss: 2.3228 iter time (s): 63.024 samples/sec: 16.248 %comms: 0.0028630631610160155 %optimizer_step 0.058957053983356454 %forward: 23.054541782205852 %backward: 61.89673392208444 [2025-04-12 00:37:13,103] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21335.58 | forward: 145299.46 | backward_microstep: 390110.51 | backward: 390099.36 | backward_inner_microstep: 390081.36 | backward_inner: 390074.65 | backward_allreduce_microstep: 8.59 | backward_allreduce: 2.95 | reduce_tied_grads: 0.33 | comms: 18.04 | reduce_grads: 0.21 | step: 371.57 | _step_clipping: 0.12 | _step_step: 369.53 | _step_zero_grad: 0.54 | _step_check_overflow: 0.63 samples/sec: 16.248 | iteration 24730/ 143000 | elapsed time per iteration (ms): 63024.8 | learning rate: 5.616E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.329866E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 00:47:57,971] [INFO] [logging.py:60:log_dist] [Rank 0] step=24740, skipped=29, lr=[0.0005616085959869539, 0.0005616085959869539], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24740 loss: 2.3476 iter time (s): 64.486 samples/sec: 15.879 %comms: 0.0028241802404244637 %optimizer_step 0.05857603520511203 %forward: 22.58677768324337 %backward: 60.50202342086274 [2025-04-12 00:47:57,972] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 35489.55 | forward: 145653.66 | backward_microstep: 390168.30 | backward: 390154.88 | backward_inner_microstep: 390136.12 | backward_inner: 390129.00 | backward_allreduce_microstep: 9.00 | backward_allreduce: 3.05 | reduce_tied_grads: 0.37 | comms: 18.21 | reduce_grads: 0.26 | step: 377.73 | _step_clipping: 0.14 | _step_step: 375.67 | _step_zero_grad: 0.60 | _step_check_overflow: 0.66 samples/sec: 15.879 | iteration 24740/ 143000 | elapsed time per iteration (ms): 64486.9 | learning rate: 5.616E-04 | approx flops per GPU: 68.5TFLOPS | lm_loss: 2.335737E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 00:59:06,819] [INFO] [logging.py:60:log_dist] [Rank 0] step=24750, skipped=29, lr=[0.0005615763309112987, 0.0005615763309112987], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24750 loss: 2.3252 iter time (s): 66.884 samples/sec: 15.310 %comms: 0.0027202542912190568 %optimizer_step 0.054558971684733275 %forward: 21.87938910558696 %backward: 58.354322822264734 [2025-04-12 00:59:06,820] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 58644.32 | forward: 146338.51 | backward_microstep: 390313.63 | backward: 390298.12 | backward_inner_microstep: 390278.43 | backward_inner: 390270.82 | backward_allreduce_microstep: 9.32 | backward_allreduce: 3.21 | reduce_tied_grads: 0.32 | comms: 18.19 | reduce_grads: 0.22 | step: 364.91 | _step_clipping: 0.14 | _step_step: 362.92 | _step_zero_grad: 0.62 | _step_check_overflow: 0.61 samples/sec: 15.310 | iteration 24750/ 143000 | elapsed time per iteration (ms): 66884.8 | learning rate: 5.616E-04 | approx flops per GPU: 66.0TFLOPS | lm_loss: 2.331225E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 01:09:54,653] [INFO] [logging.py:60:log_dist] [Rank 0] step=24760, skipped=29, lr=[0.0005615440532107969, 0.0005615440532107969], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24760 loss: 2.3643 iter time (s): 64.783 samples/sec: 15.807 %comms: 0.002860863214317232 %optimizer_step 0.056033024799494444 %forward: 22.486780199873355 %backward: 60.25028117287581 [2025-04-12 01:09:54,654] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 38253.87 | forward: 145675.62 | backward_microstep: 390335.64 | backward: 390318.09 | backward_inner_microstep: 390297.88 | backward_inner: 390290.46 | backward_allreduce_microstep: 9.77 | backward_allreduce: 3.39 | reduce_tied_grads: 0.33 | comms: 18.53 | reduce_grads: 0.22 | step: 363.00 | _step_clipping: 0.13 | _step_step: 361.08 | _step_zero_grad: 0.56 | _step_check_overflow: 0.58 samples/sec: 15.807 | iteration 24760/ 143000 | elapsed time per iteration (ms): 64783.4 | learning rate: 5.615E-04 | approx flops per GPU: 68.2TFLOPS | lm_loss: 2.336194E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 01:20:31,797] [INFO] [logging.py:60:log_dist] [Rank 0] step=24770, skipped=29, lr=[0.0005615117628870062, 0.0005615117628870062], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24770 loss: 2.3267 iter time (s): 63.714 samples/sec: 16.072 %comms: 0.002857972798009868 %optimizer_step 0.05910994989307418 %forward: 22.891717323459602 %backward: 61.25094613306417 [2025-04-12 01:20:31,798] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27489.88 | forward: 145851.74 | backward_microstep: 390265.49 | backward: 390252.80 | backward_inner_microstep: 390233.19 | backward_inner: 390226.02 | backward_allreduce_microstep: 9.38 | backward_allreduce: 3.21 | reduce_tied_grads: 0.32 | comms: 18.21 | reduce_grads: 0.21 | step: 376.61 | _step_clipping: 0.11 | _step_step: 374.67 | _step_zero_grad: 0.59 | _step_check_overflow: 0.62 samples/sec: 16.072 | iteration 24770/ 143000 | elapsed time per iteration (ms): 63714.4 | learning rate: 5.615E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.339104E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 01:31:17,129] [INFO] [logging.py:60:log_dist] [Rank 0] step=24780, skipped=29, lr=[0.0005614794599414852, 0.0005614794599414852], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24780 loss: 2.3275 iter time (s): 64.533 samples/sec: 15.868 %comms: 0.0028274758947233634 %optimizer_step 0.06126276588962667 %forward: 22.581083177578318 %backward: 60.47668647055448 [2025-04-12 01:31:17,130] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 35745.99 | forward: 145721.40 | backward_microstep: 390285.39 | backward: 390271.25 | backward_inner_microstep: 390251.44 | backward_inner: 390243.88 | backward_allreduce_microstep: 9.49 | backward_allreduce: 3.26 | reduce_tied_grads: 0.36 | comms: 18.25 | reduce_grads: 0.25 | step: 395.34 | _step_clipping: 0.14 | _step_step: 393.32 | _step_zero_grad: 0.58 | _step_check_overflow: 0.67 samples/sec: 15.868 | iteration 24780/ 143000 | elapsed time per iteration (ms): 64533.2 | learning rate: 5.615E-04 | approx flops per GPU: 68.5TFLOPS | lm_loss: 2.323213E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 01:42:07,965] [INFO] [logging.py:60:log_dist] [Rank 0] step=24790, skipped=29, lr=[0.0005614471443757929, 0.0005614471443757929], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24790 loss: 2.3398 iter time (s): 65.083 samples/sec: 15.734 %comms: 0.0028062372329032046 %optimizer_step 0.05833488909876584 %forward: 22.430937958247135 %backward: 59.956729480612594 [2025-04-12 01:42:07,966] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 41059.00 | forward: 145987.14 | backward_microstep: 390230.08 | backward: 390216.02 | backward_inner_microstep: 390195.37 | backward_inner: 390188.24 | backward_allreduce_microstep: 10.74 | backward_allreduce: 4.82 | reduce_tied_grads: 0.38 | comms: 18.26 | reduce_grads: 0.26 | step: 379.66 | _step_clipping: 0.14 | _step_step: 377.26 | _step_zero_grad: 0.67 | _step_check_overflow: 0.81 samples/sec: 15.734 | iteration 24790/ 143000 | elapsed time per iteration (ms): 65083.7 | learning rate: 5.614E-04 | approx flops per GPU: 67.9TFLOPS | lm_loss: 2.321602E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 01:52:50,273] [INFO] [logging.py:60:log_dist] [Rank 0] step=24800, skipped=29, lr=[0.0005614148161914891, 0.0005614148161914891], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24800 loss: 2.3314 iter time (s): 64.230 samples/sec: 15.943 %comms: 0.0028340325922434056 %optimizer_step 0.05488633493860276 %forward: 22.68721935506362 %backward: 60.738873525366664 [2025-04-12 01:52:50,274] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32942.06 | forward: 145720.24 | backward_microstep: 390140.14 | backward: 390126.40 | backward_inner_microstep: 390108.43 | backward_inner: 390101.59 | backward_allreduce_microstep: 8.60 | backward_allreduce: 2.99 | reduce_tied_grads: 0.30 | comms: 18.20 | reduce_grads: 0.20 | step: 352.54 | _step_clipping: 0.12 | _step_step: 350.66 | _step_zero_grad: 0.55 | _step_check_overflow: 0.60 samples/sec: 15.943 | iteration 24800/ 143000 | elapsed time per iteration (ms): 64230.8 | learning rate: 5.614E-04 | approx flops per GPU: 68.8TFLOPS | lm_loss: 2.341731E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 02:03:21,457] [INFO] [logging.py:60:log_dist] [Rank 0] step=24810, skipped=29, lr=[0.0005613824753901341, 0.0005613824753901341], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24810 loss: 2.3336 iter time (s): 63.118 samples/sec: 16.224 %comms: 0.002879066818243503 %optimizer_step 0.057055489933182844 %forward: 23.027734955451432 %backward: 61.80551624004727 [2025-04-12 02:03:21,458] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22209.80 | forward: 145345.91 | backward_microstep: 390114.24 | backward: 390102.59 | backward_inner_microstep: 390080.30 | backward_inner: 390073.36 | backward_allreduce_microstep: 10.93 | backward_allreduce: 4.77 | reduce_tied_grads: 0.34 | comms: 18.17 | reduce_grads: 0.23 | step: 360.12 | _step_clipping: 0.13 | _step_step: 358.25 | _step_zero_grad: 0.50 | _step_check_overflow: 0.62 samples/sec: 16.223 | iteration 24810/ 143000 | elapsed time per iteration (ms): 63118.4 | learning rate: 5.614E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.335915E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 02:13:53,956] [INFO] [logging.py:60:log_dist] [Rank 0] step=24820, skipped=29, lr=[0.0005613501219732886, 0.0005613501219732886], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24820 loss: 2.3179 iter time (s): 63.249 samples/sec: 16.190 %comms: 0.002905162047946848 %optimizer_step 0.0571317626106862 %forward: 22.989311139091477 %backward: 61.67644432244966 [2025-04-12 02:13:53,958] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23470.01 | forward: 145405.57 | backward_microstep: 390109.72 | backward: 390098.63 | backward_inner_microstep: 390080.24 | backward_inner: 390073.15 | backward_allreduce_microstep: 8.88 | backward_allreduce: 3.04 | reduce_tied_grads: 0.37 | comms: 18.37 | reduce_grads: 0.23 | step: 361.35 | _step_clipping: 0.12 | _step_step: 359.44 | _step_zero_grad: 0.53 | _step_check_overflow: 0.62 samples/sec: 16.190 | iteration 24820/ 143000 | elapsed time per iteration (ms): 63250.0 | learning rate: 5.614E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.323901E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 02:24:42,013] [INFO] [logging.py:60:log_dist] [Rank 0] step=24830, skipped=29, lr=[0.0005613177559425145, 0.0005613177559425145], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24830 loss: 2.3530 iter time (s): 64.805 samples/sec: 15.801 %comms: 0.003081917793627466 %optimizer_step 0.06032376727756429 %forward: 22.458893678837384 %backward: 60.198578062362294 [2025-04-12 02:24:42,014] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 38868.45 | forward: 145544.54 | backward_microstep: 390127.75 | backward: 390116.03 | backward_inner_microstep: 390096.72 | backward_inner: 390087.93 | backward_allreduce_microstep: 9.34 | backward_allreduce: 3.19 | reduce_tied_grads: 0.34 | comms: 19.97 | reduce_grads: 0.21 | step: 390.93 | _step_clipping: 0.13 | _step_step: 388.90 | _step_zero_grad: 0.60 | _step_check_overflow: 0.63 samples/sec: 15.801 | iteration 24830/ 143000 | elapsed time per iteration (ms): 64805.6 | learning rate: 5.613E-04 | approx flops per GPU: 68.2TFLOPS | lm_loss: 2.346362E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 02:35:19,140] [INFO] [logging.py:60:log_dist] [Rank 0] step=24840, skipped=29, lr=[0.0005612853772993736, 0.0005612853772993736], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24840 loss: 2.3310 iter time (s): 63.712 samples/sec: 16.072 %comms: 0.0028530360776270656 %optimizer_step 0.056599691339449074 %forward: 22.848883086033652 %backward: 61.24009066028915 [2025-04-12 02:35:19,141] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27854.21 | forward: 145574.86 | backward_microstep: 390186.37 | backward: 390173.03 | backward_inner_microstep: 390154.61 | backward_inner: 390147.61 | backward_allreduce_microstep: 8.87 | backward_allreduce: 3.08 | reduce_tied_grads: 0.34 | comms: 18.18 | reduce_grads: 0.23 | step: 360.61 | _step_clipping: 0.14 | _step_step: 358.74 | _step_zero_grad: 0.51 | _step_check_overflow: 0.54 samples/sec: 16.072 | iteration 24840/ 143000 | elapsed time per iteration (ms): 63712.7 | learning rate: 5.613E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.329426E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 02:45:58,050] [INFO] [logging.py:60:log_dist] [Rank 0] step=24850, skipped=29, lr=[0.0005612529860454287, 0.0005612529860454287], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24850 loss: 2.3136 iter time (s): 63.890 samples/sec: 16.027 %comms: 0.0028269340380220566 %optimizer_step 0.05529845697648412 %forward: 22.773656942436872 %backward: 61.06163875374586 [2025-04-12 02:45:58,051] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29765.38 | forward: 145501.84 | backward_microstep: 390136.91 | backward: 390125.36 | backward_inner_microstep: 390105.51 | backward_inner: 390098.77 | backward_allreduce_microstep: 8.58 | backward_allreduce: 2.95 | reduce_tied_grads: 0.32 | comms: 18.06 | reduce_grads: 0.23 | step: 353.30 | _step_clipping: 0.13 | _step_step: 351.48 | _step_zero_grad: 0.56 | _step_check_overflow: 0.50 samples/sec: 16.027 | iteration 24850/ 143000 | elapsed time per iteration (ms): 63891.0 | learning rate: 5.613E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.339260E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 02:56:35,975] [INFO] [logging.py:60:log_dist] [Rank 0] step=24860, skipped=29, lr=[0.0005612205821822434, 0.0005612205821822434], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24860 loss: 2.3275 iter time (s): 63.792 samples/sec: 16.052 %comms: 0.0028950582669467063 %optimizer_step 0.05699569579124796 %forward: 22.82801779796803 %backward: 61.18809211649562 [2025-04-12 02:56:35,976] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28399.77 | forward: 145624.38 | backward_microstep: 390344.84 | backward: 390330.78 | backward_inner_microstep: 390312.68 | backward_inner: 390305.74 | backward_allreduce_microstep: 8.60 | backward_allreduce: 2.97 | reduce_tied_grads: 0.38 | comms: 18.47 | reduce_grads: 0.23 | step: 363.59 | _step_clipping: 0.14 | _step_step: 361.61 | _step_zero_grad: 0.58 | _step_check_overflow: 0.60 samples/sec: 16.052 | iteration 24860/ 143000 | elapsed time per iteration (ms): 63792.5 | learning rate: 5.612E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.334360E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 03:07:20,279] [INFO] [logging.py:60:log_dist] [Rank 0] step=24870, skipped=29, lr=[0.0005611881657113811, 0.0005611881657113811], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24870 loss: 2.3358 iter time (s): 64.430 samples/sec: 15.893 %comms: 0.002874616396831022 %optimizer_step 0.05760770779997946 %forward: 22.630284423552443 %backward: 60.57496445791397 [2025-04-12 03:07:20,280] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 34638.89 | forward: 145806.27 | backward_microstep: 390298.19 | backward: 390282.75 | backward_inner_microstep: 390263.99 | backward_inner: 390256.93 | backward_allreduce_microstep: 8.96 | backward_allreduce: 3.08 | reduce_tied_grads: 0.34 | comms: 18.52 | reduce_grads: 0.25 | step: 371.16 | _step_clipping: 0.12 | _step_step: 369.20 | _step_zero_grad: 0.61 | _step_check_overflow: 0.53 samples/sec: 15.893 | iteration 24870/ 143000 | elapsed time per iteration (ms): 64430.4 | learning rate: 5.612E-04 | approx flops per GPU: 68.6TFLOPS | lm_loss: 2.348268E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 03:17:53,480] [INFO] [logging.py:60:log_dist] [Rank 0] step=24880, skipped=29, lr=[0.000561155736634407, 0.000561155736634407], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24880 loss: 2.3285 iter time (s): 63.319 samples/sec: 16.172 %comms: 0.002989633349685152 %optimizer_step 0.06049599274514536 %forward: 22.994678250062876 %backward: 61.65543818030218 [2025-04-12 03:17:53,481] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23568.19 | forward: 145601.06 | backward_microstep: 390413.77 | backward: 390398.90 | backward_inner_microstep: 390379.64 | backward_inner: 390371.85 | backward_allreduce_microstep: 9.10 | backward_allreduce: 3.14 | reduce_tied_grads: 0.43 | comms: 18.93 | reduce_grads: 0.29 | step: 383.06 | _step_clipping: 0.17 | _step_step: 380.67 | _step_zero_grad: 0.69 | _step_check_overflow: 0.73 samples/sec: 16.172 | iteration 24880/ 143000 | elapsed time per iteration (ms): 63320.2 | learning rate: 5.612E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.329275E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 03:28:32,685] [INFO] [logging.py:60:log_dist] [Rank 0] step=24890, skipped=29, lr=[0.000561123294952886, 0.000561123294952886], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24890 loss: 2.3180 iter time (s): 63.920 samples/sec: 16.020 %comms: 0.0028376050100025374 %optimizer_step 0.0598449837300656 %forward: 22.765228156055667 %backward: 61.04063155520919 [2025-04-12 03:28:32,686] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29920.52 | forward: 145515.02 | backward_microstep: 390183.25 | backward: 390170.87 | backward_inner_microstep: 390151.95 | backward_inner: 390144.90 | backward_allreduce_microstep: 9.03 | backward_allreduce: 3.09 | reduce_tied_grads: 0.39 | comms: 18.14 | reduce_grads: 0.24 | step: 382.53 | _step_clipping: 0.13 | _step_step: 380.63 | _step_zero_grad: 0.57 | _step_check_overflow: 0.54 samples/sec: 16.020 | iteration 24890/ 143000 | elapsed time per iteration (ms): 63920.5 | learning rate: 5.611E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.325271E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 03:39:05,343] [INFO] [logging.py:60:log_dist] [Rank 0] step=24900, skipped=29, lr=[0.0005610908406683837, 0.0005610908406683837], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24900 loss: 2.3173 iter time (s): 63.265 samples/sec: 16.186 %comms: 0.0029521040505177325 %optimizer_step 0.057146041449626386 %forward: 22.98591495213176 %backward: 61.68455424840463 [2025-04-12 03:39:05,344] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23385.93 | forward: 145420.65 | backward_microstep: 390261.07 | backward: 390248.02 | backward_inner_microstep: 390229.75 | backward_inner: 390222.93 | backward_allreduce_microstep: 8.53 | backward_allreduce: 2.93 | reduce_tied_grads: 0.35 | comms: 18.68 | reduce_grads: 0.20 | step: 361.54 | _step_clipping: 0.13 | _step_step: 359.40 | _step_zero_grad: 0.55 | _step_check_overflow: 0.74 samples/sec: 16.186 | iteration 24900/ 143000 | elapsed time per iteration (ms): 63265.8 | learning rate: 5.611E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.316973E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 03:49:42,827] [INFO] [logging.py:60:log_dist] [Rank 0] step=24910, skipped=29, lr=[0.0005610583737824668, 0.0005610583737824668], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24910 loss: 2.3566 iter time (s): 63.748 samples/sec: 16.063 %comms: 0.0028408906489404747 %optimizer_step 0.05716268129821706 %forward: 22.817234896327857 %backward: 61.194328089244074 [2025-04-12 03:49:42,828] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28361.93 | forward: 145454.72 | backward_microstep: 390111.17 | backward: 390100.03 | backward_inner_microstep: 390080.31 | backward_inner: 390073.57 | backward_allreduce_microstep: 10.39 | backward_allreduce: 4.63 | reduce_tied_grads: 0.36 | comms: 18.11 | reduce_grads: 0.22 | step: 364.40 | _step_clipping: 0.14 | _step_step: 362.45 | _step_zero_grad: 0.56 | _step_check_overflow: 0.61 samples/sec: 16.063 | iteration 24910/ 143000 | elapsed time per iteration (ms): 63748.4 | learning rate: 5.611E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.333274E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 04:00:20,280] [INFO] [logging.py:60:log_dist] [Rank 0] step=24920, skipped=29, lr=[0.0005610258942967022, 0.0005610258942967022], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24920 loss: 2.3245 iter time (s): 63.745 samples/sec: 16.064 %comms: 0.0029168401530337883 %optimizer_step 0.058490413484450665 %forward: 22.89855996653392 %backward: 61.22575837743929 [2025-04-12 04:00:20,281] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27552.98 | forward: 145966.19 | backward_microstep: 390295.70 | backward: 390281.79 | backward_inner_microstep: 390260.89 | backward_inner: 390253.68 | backward_allreduce_microstep: 9.23 | backward_allreduce: 3.15 | reduce_tied_grads: 0.39 | comms: 18.59 | reduce_grads: 0.26 | step: 372.85 | _step_clipping: 0.14 | _step_step: 370.27 | _step_zero_grad: 0.62 | _step_check_overflow: 1.07 samples/sec: 16.064 | iteration 24920/ 143000 | elapsed time per iteration (ms): 63745.4 | learning rate: 5.610E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.339128E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 04:11:17,881] [INFO] [logging.py:60:log_dist] [Rank 0] step=24930, skipped=29, lr=[0.0005609934022126574, 0.0005609934022126574], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24930 loss: 2.3330 iter time (s): 65.759 samples/sec: 15.572 %comms: 0.002803041550975911 %optimizer_step 0.06002051877754134 %forward: 22.274013358614706 %backward: 59.368001979096995 [2025-04-12 04:11:17,882] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 47050.24 | forward: 146472.45 | backward_microstep: 390416.90 | backward: 390400.08 | backward_inner_microstep: 390379.83 | backward_inner: 390372.27 | backward_allreduce_microstep: 9.61 | backward_allreduce: 3.33 | reduce_tied_grads: 0.37 | comms: 18.43 | reduce_grads: 0.25 | step: 394.69 | _step_clipping: 0.15 | _step_step: 392.50 | _step_zero_grad: 0.66 | _step_check_overflow: 0.64 samples/sec: 15.572 | iteration 24930/ 143000 | elapsed time per iteration (ms): 65760.0 | learning rate: 5.610E-04 | approx flops per GPU: 67.2TFLOPS | lm_loss: 2.328940E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 04:21:49,865] [INFO] [logging.py:60:log_dist] [Rank 0] step=24940, skipped=29, lr=[0.0005609608975319007, 0.0005609608975319007], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24940 loss: 2.3336 iter time (s): 63.198 samples/sec: 16.203 %comms: 0.002896396042943311 %optimizer_step 0.05762385114729185 %forward: 22.997697704866425 %backward: 61.72221212462629 [2025-04-12 04:21:49,866] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23002.11 | forward: 145340.39 | backward_microstep: 390080.68 | backward: 390070.82 | backward_inner_microstep: 390051.63 | backward_inner: 390044.63 | backward_allreduce_microstep: 9.43 | backward_allreduce: 3.25 | reduce_tied_grads: 0.58 | comms: 18.30 | reduce_grads: 0.24 | step: 364.17 | _step_clipping: 0.12 | _step_step: 362.34 | _step_zero_grad: 0.53 | _step_check_overflow: 0.55 samples/sec: 16.203 | iteration 24940/ 143000 | elapsed time per iteration (ms): 63198.4 | learning rate: 5.610E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.327200E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 04:32:24,040] [INFO] [logging.py:60:log_dist] [Rank 0] step=24950, skipped=29, lr=[0.0005609283802560008, 0.0005609283802560008], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24950 loss: 2.3427 iter time (s): 63.417 samples/sec: 16.147 %comms: 0.002853607961498766 %optimizer_step 0.05793189213728537 %forward: 22.94093168855901 %backward: 61.50755577382174 [2025-04-12 04:32:24,040] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25046.11 | forward: 145484.18 | backward_microstep: 390071.15 | backward: 390061.60 | backward_inner_microstep: 390043.17 | backward_inner: 390036.49 | backward_allreduce_microstep: 8.89 | backward_allreduce: 3.03 | reduce_tied_grads: 0.32 | comms: 18.10 | reduce_grads: 0.23 | step: 367.39 | _step_clipping: 0.12 | _step_step: 365.55 | _step_zero_grad: 0.52 | _step_check_overflow: 0.59 samples/sec: 16.147 | iteration 24950/ 143000 | elapsed time per iteration (ms): 63417.4 | learning rate: 5.609E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.340098E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 04:43:02,549] [INFO] [logging.py:60:log_dist] [Rank 0] step=24960, skipped=29, lr=[0.0005608958503865273, 0.0005608958503865273], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24960 loss: 2.3260 iter time (s): 63.850 samples/sec: 16.037 %comms: 0.0028580197567938106 %optimizer_step 0.05664177694405351 %forward: 22.748712576920713 %backward: 61.090938933341576 [2025-04-12 04:43:02,550] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29590.81 | forward: 145251.34 | backward_microstep: 390078.62 | backward: 390067.82 | backward_inner_microstep: 390049.22 | backward_inner: 390042.42 | backward_allreduce_microstep: 9.13 | backward_allreduce: 3.09 | reduce_tied_grads: 0.37 | comms: 18.25 | reduce_grads: 0.27 | step: 361.66 | _step_clipping: 0.13 | _step_step: 359.74 | _step_zero_grad: 0.52 | _step_check_overflow: 0.63 samples/sec: 16.037 | iteration 24960/ 143000 | elapsed time per iteration (ms): 63850.9 | learning rate: 5.609E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.332274E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 04:53:45,304] [INFO] [logging.py:60:log_dist] [Rank 0] step=24970, skipped=29, lr=[0.0005608633079250501, 0.0005608633079250501], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24970 loss: 2.3316 iter time (s): 64.275 samples/sec: 15.932 %comms: 0.0028158114170018576 %optimizer_step 0.05605761183914351 %forward: 22.65257826793226 %backward: 60.714177687157054 [2025-04-12 04:53:45,304] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 33314.46 | forward: 145599.16 | backward_microstep: 390252.25 | backward: 390239.62 | backward_inner_microstep: 390221.28 | backward_inner: 390214.39 | backward_allreduce_microstep: 8.81 | backward_allreduce: 3.05 | reduce_tied_grads: 0.35 | comms: 18.10 | reduce_grads: 0.22 | step: 360.31 | _step_clipping: 0.14 | _step_step: 358.40 | _step_zero_grad: 0.56 | _step_check_overflow: 0.57 samples/sec: 15.931 | iteration 24970/ 143000 | elapsed time per iteration (ms): 64275.5 | learning rate: 5.609E-04 | approx flops per GPU: 68.7TFLOPS | lm_loss: 2.330511E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 05:04:29,064] [INFO] [logging.py:60:log_dist] [Rank 0] step=24980, skipped=29, lr=[0.00056083075287314, 0.00056083075287314], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24980 loss: 2.3173 iter time (s): 64.375 samples/sec: 15.907 %comms: 0.0028117097787098482 %optimizer_step 0.05693331760547643 %forward: 22.594770416174633 %backward: 60.594638571470284 [2025-04-12 05:04:29,065] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 34612.01 | forward: 145454.79 | backward_microstep: 390091.36 | backward: 390080.54 | backward_inner_microstep: 390062.38 | backward_inner: 390055.62 | backward_allreduce_microstep: 8.76 | backward_allreduce: 3.01 | reduce_tied_grads: 0.35 | comms: 18.10 | reduce_grads: 0.22 | step: 366.51 | _step_clipping: 0.12 | _step_step: 364.72 | _step_zero_grad: 0.52 | _step_check_overflow: 0.51 samples/sec: 15.907 | iteration 24980/ 143000 | elapsed time per iteration (ms): 64376.0 | learning rate: 5.608E-04 | approx flops per GPU: 68.6TFLOPS | lm_loss: 2.332717E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 05:15:05,845] [INFO] [logging.py:60:log_dist] [Rank 0] step=24990, skipped=29, lr=[0.0005607981852323681, 0.0005607981852323681], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 24990 loss: 2.3140 iter time (s): 63.678 samples/sec: 16.081 %comms: 0.0028295703574573285 %optimizer_step 0.055383804794614244 %forward: 22.83032543373437 %backward: 61.257925911201916 [2025-04-12 05:15:05,846] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27730.45 | forward: 145377.89 | backward_microstep: 390085.63 | backward: 390075.39 | backward_inner_microstep: 390058.64 | backward_inner: 390052.29 | backward_allreduce_microstep: 8.12 | backward_allreduce: 2.73 | reduce_tied_grads: 0.32 | comms: 18.02 | reduce_grads: 0.20 | step: 352.67 | _step_clipping: 0.14 | _step_step: 350.97 | _step_zero_grad: 0.51 | _step_check_overflow: 0.48 samples/sec: 16.081 | iteration 24990/ 143000 | elapsed time per iteration (ms): 63678.1 | learning rate: 5.608E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.322614E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 05:25:44,086] [INFO] [logging.py:60:log_dist] [Rank 0] step=25000, skipped=29, lr=[0.0005607656050043064, 0.0005607656050043064], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25000 loss: 2.3568 iter time (s): 63.823 samples/sec: 16.044 %comms: 0.002839612681253222 %optimizer_step 0.05686270695028766 %forward: 22.78939986418997 %backward: 61.14942301216416 [2025-04-12 05:25:44,087] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28876.23 | forward: 145449.82 | backward_microstep: 390287.39 | backward: 390276.72 | backward_inner_microstep: 390258.02 | backward_inner: 390251.26 | backward_allreduce_microstep: 9.22 | backward_allreduce: 3.26 | reduce_tied_grads: 0.34 | comms: 18.12 | reduce_grads: 0.22 | step: 362.92 | _step_clipping: 0.12 | _step_step: 361.00 | _step_zero_grad: 0.57 | _step_check_overflow: 0.57 samples/sec: 16.044 | iteration 25000/ 143000 | elapsed time per iteration (ms): 63824.1 | learning rate: 5.608E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.335706E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 05:25:46,917] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: ../checkpoints/mydata_left-pythia160m/global_step25000/mp_rank_00_model_states.pt [2025-04-12 05:26:00,540] [INFO] [engine.py:1805:_copy_recovery_script] creating recovery script ../checkpoints/mydata_left-pythia160m/zero_to_fp32.py [2025-04-12 05:26:00,552] [INFO] [engine.py:1818:_save_zero_checkpoint] zero checkpoint saved ../checkpoints/mydata_left-pythia160m/global_step25000/zero_pp_rank_0_mp_rank_00_optim_states.pt [2025-04-12 05:36:37,927] [INFO] [logging.py:60:log_dist] [Rank 0] step=25010, skipped=29, lr=[0.0005607330121905272, 0.0005607330121905272], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25010 loss: 2.3422 iter time (s): 63.736 samples/sec: 16.066 %comms: 0.002829575826316268 %optimizer_step 0.05677047386157691 %forward: 22.827600958803128 %backward: 61.222909170003994 [2025-04-12 05:36:37,928] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28013.92 | forward: 145492.98 | backward_microstep: 390216.62 | backward: 390207.60 | backward_inner_microstep: 390190.15 | backward_inner: 390183.72 | backward_allreduce_microstep: 8.50 | backward_allreduce: 2.96 | reduce_tied_grads: 0.31 | comms: 18.03 | reduce_grads: 0.21 | step: 361.83 | _step_clipping: 0.14 | _step_step: 359.85 | _step_zero_grad: 0.53 | _step_check_overflow: 0.72 samples/sec: 15.661 | iteration 25010/ 143000 | elapsed time per iteration (ms): 65384.2 | learning rate: 5.607E-04 | approx flops per GPU: 67.6TFLOPS | lm_loss: 2.335178E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 05:47:14,521] [INFO] [logging.py:60:log_dist] [Rank 0] step=25020, skipped=29, lr=[0.0005607004067926037, 0.0005607004067926037], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25020 loss: 2.3368 iter time (s): 63.659 samples/sec: 16.086 %comms: 0.0028489846722029167 %optimizer_step 0.0564527722888805 %forward: 22.846433922882103 %backward: 61.32079264978958 [2025-04-12 05:47:14,521] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27108.01 | forward: 145437.40 | backward_microstep: 390373.05 | backward: 390360.13 | backward_inner_microstep: 390342.05 | backward_inner: 390335.27 | backward_allreduce_microstep: 8.66 | backward_allreduce: 3.02 | reduce_tied_grads: 0.32 | comms: 18.14 | reduce_grads: 0.21 | step: 359.37 | _step_clipping: 0.12 | _step_step: 357.48 | _step_zero_grad: 0.57 | _step_check_overflow: 0.58 samples/sec: 16.086 | iteration 25020/ 143000 | elapsed time per iteration (ms): 63659.3 | learning rate: 5.607E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.330859E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 05:57:45,811] [INFO] [logging.py:60:log_dist] [Rank 0] step=25030, skipped=29, lr=[0.0005606677888121096, 0.0005606677888121096], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25030 loss: 2.3215 iter time (s): 63.128 samples/sec: 16.221 %comms: 0.002868874038497453 %optimizer_step 0.05638786614117746 %forward: 23.00083027961107 %backward: 61.81074069541433 [2025-04-12 05:57:45,811] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22231.90 | forward: 145200.64 | backward_microstep: 390211.81 | backward: 390201.52 | backward_inner_microstep: 390184.14 | backward_inner: 390177.62 | backward_allreduce_microstep: 8.34 | backward_allreduce: 2.88 | reduce_tied_grads: 0.31 | comms: 18.11 | reduce_grads: 0.20 | step: 355.97 | _step_clipping: 0.11 | _step_step: 354.02 | _step_zero_grad: 0.54 | _step_check_overflow: 0.69 samples/sec: 16.221 | iteration 25030/ 143000 | elapsed time per iteration (ms): 63129.0 | learning rate: 5.607E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.322105E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 06:08:32,845] [INFO] [logging.py:60:log_dist] [Rank 0] step=25040, skipped=29, lr=[0.0005606351582506192, 0.0005606351582506192], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25040 loss: 2.3338 iter time (s): 64.703 samples/sec: 15.826 %comms: 0.002807504156787169 %optimizer_step 0.05819766606528032 %forward: 22.497950202491314 %backward: 60.327721543590584 [2025-04-12 06:08:32,846] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 37436.36 | forward: 145568.13 | backward_microstep: 390348.09 | backward: 390337.51 | backward_inner_microstep: 390316.82 | backward_inner: 390309.86 | backward_allreduce_microstep: 9.18 | backward_allreduce: 3.18 | reduce_tied_grads: 0.36 | comms: 18.17 | reduce_grads: 0.23 | step: 376.56 | _step_clipping: 0.13 | _step_step: 374.17 | _step_zero_grad: 0.60 | _step_check_overflow: 0.98 samples/sec: 15.826 | iteration 25040/ 143000 | elapsed time per iteration (ms): 64703.5 | learning rate: 5.606E-04 | approx flops per GPU: 68.3TFLOPS | lm_loss: 2.332415E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 06:19:19,521] [INFO] [logging.py:60:log_dist] [Rank 0] step=25050, skipped=29, lr=[0.0005606025151097072, 0.0005606025151097072], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25050 loss: 2.3225 iter time (s): 64.667 samples/sec: 15.835 %comms: 0.0028226363241430712 %optimizer_step 0.05834612227468691 %forward: 22.544045477695292 %backward: 60.36971936098266 [2025-04-12 06:19:19,522] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 36792.25 | forward: 145785.15 | backward_microstep: 390406.03 | backward: 390391.70 | backward_inner_microstep: 390370.99 | backward_inner: 390363.57 | backward_allreduce_microstep: 10.11 | backward_allreduce: 3.51 | reduce_tied_grads: 0.39 | comms: 18.25 | reduce_grads: 0.25 | step: 377.31 | _step_clipping: 0.14 | _step_step: 375.17 | _step_zero_grad: 0.80 | _step_check_overflow: 0.53 samples/sec: 15.835 | iteration 25050/ 143000 | elapsed time per iteration (ms): 64667.5 | learning rate: 5.606E-04 | approx flops per GPU: 68.3TFLOPS | lm_loss: 2.320563E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 06:29:57,066] [INFO] [logging.py:60:log_dist] [Rank 0] step=25060, skipped=29, lr=[0.0005605698593909493, 0.0005605698593909493], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25060 loss: 2.3351 iter time (s): 63.754 samples/sec: 16.062 %comms: 0.0028268885644847194 %optimizer_step 0.05627210539433377 %forward: 22.831915147216524 %backward: 61.22414012185352 [2025-04-12 06:29:57,067] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27960.61 | forward: 145562.53 | backward_microstep: 390339.42 | backward: 390328.21 | backward_inner_microstep: 390310.32 | backward_inner: 390303.59 | backward_allreduce_microstep: 8.62 | backward_allreduce: 2.96 | reduce_tied_grads: 0.31 | comms: 18.02 | reduce_grads: 0.20 | step: 358.76 | _step_clipping: 0.13 | _step_step: 356.82 | _step_zero_grad: 0.53 | _step_check_overflow: 0.68 samples/sec: 16.062 | iteration 25060/ 143000 | elapsed time per iteration (ms): 63754.5 | learning rate: 5.606E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.326790E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 06:40:26,865] [INFO] [logging.py:60:log_dist] [Rank 0] step=25070, skipped=29, lr=[0.0005605371910959216, 0.0005605371910959216], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25070 loss: 2.3356 iter time (s): 62.979 samples/sec: 16.259 %comms: 0.0028849808739903737 %optimizer_step 0.060462775599592226 %forward: 23.098996219043432 %backward: 61.95560026094782 [2025-04-12 06:40:26,866] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20440.27 | forward: 145475.83 | backward_microstep: 390201.30 | backward: 390191.94 | backward_inner_microstep: 390174.48 | backward_inner: 390167.88 | backward_allreduce_microstep: 8.53 | backward_allreduce: 2.88 | reduce_tied_grads: 0.32 | comms: 18.17 | reduce_grads: 0.20 | step: 380.79 | _step_clipping: 0.12 | _step_step: 378.93 | _step_zero_grad: 0.54 | _step_check_overflow: 0.59 samples/sec: 16.259 | iteration 25070/ 143000 | elapsed time per iteration (ms): 62979.9 | learning rate: 5.605E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.322607E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 06:51:19,181] [INFO] [logging.py:60:log_dist] [Rank 0] step=25080, skipped=29, lr=[0.0005605045102262007, 0.0005605045102262007], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25080 loss: 2.3351 iter time (s): 65.231 samples/sec: 15.698 %comms: 0.002779220323409893 %optimizer_step 0.05872777376989695 %forward: 22.385766556540414 %backward: 59.825382865231305 [2025-04-12 06:51:19,182] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 42357.78 | forward: 146024.42 | backward_microstep: 390256.81 | backward: 390246.49 | backward_inner_microstep: 390227.07 | backward_inner: 390219.83 | backward_allreduce_microstep: 9.49 | backward_allreduce: 3.42 | reduce_tied_grads: 0.34 | comms: 18.13 | reduce_grads: 0.27 | step: 383.09 | _step_clipping: 0.13 | _step_step: 379.22 | _step_zero_grad: 2.36 | _step_check_overflow: 0.69 samples/sec: 15.698 | iteration 25080/ 143000 | elapsed time per iteration (ms): 65231.6 | learning rate: 5.605E-04 | approx flops per GPU: 67.7TFLOPS | lm_loss: 2.328053E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 07:02:00,278] [INFO] [logging.py:60:log_dist] [Rank 0] step=25090, skipped=29, lr=[0.000560471816783364, 0.000560471816783364], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25090 loss: 2.3353 iter time (s): 64.109 samples/sec: 15.973 %comms: 0.0028034978330277144 %optimizer_step 0.05589598389626872 %forward: 22.692708369343286 %backward: 60.874461373413816 [2025-04-12 07:02:00,278] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31679.72 | forward: 145480.70 | backward_microstep: 390269.88 | backward: 390260.13 | backward_inner_microstep: 390239.99 | backward_inner: 390233.33 | backward_allreduce_microstep: 10.78 | backward_allreduce: 4.90 | reduce_tied_grads: 0.30 | comms: 17.97 | reduce_grads: 0.22 | step: 358.34 | _step_clipping: 0.13 | _step_step: 356.53 | _step_zero_grad: 0.56 | _step_check_overflow: 0.51 samples/sec: 15.973 | iteration 25090/ 143000 | elapsed time per iteration (ms): 64109.6 | learning rate: 5.605E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.327457E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 07:12:42,181] [INFO] [logging.py:60:log_dist] [Rank 0] step=25100, skipped=29, lr=[0.0005604391107689895, 0.0005604391107689895], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25100 loss: 2.3337 iter time (s): 64.190 samples/sec: 15.953 %comms: 0.002801456789027369 %optimizer_step 0.0560601128563064 %forward: 22.725998571331324 %backward: 60.79952585046704 [2025-04-12 07:12:42,182] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32038.29 | forward: 145877.63 | backward_microstep: 390282.49 | backward: 390270.68 | backward_inner_microstep: 390251.23 | backward_inner: 390244.21 | backward_allreduce_microstep: 9.59 | backward_allreduce: 3.23 | reduce_tied_grads: 0.32 | comms: 17.98 | reduce_grads: 0.23 | step: 359.85 | _step_clipping: 0.13 | _step_step: 357.93 | _step_zero_grad: 0.57 | _step_check_overflow: 0.62 samples/sec: 15.953 | iteration 25100/ 143000 | elapsed time per iteration (ms): 64190.4 | learning rate: 5.604E-04 | approx flops per GPU: 68.8TFLOPS | lm_loss: 2.333870E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 07:23:18,503] [INFO] [logging.py:60:log_dist] [Rank 0] step=25110, skipped=29, lr=[0.0005604063921846557, 0.0005604063921846557], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25110 loss: 2.3122 iter time (s): 63.632 samples/sec: 16.093 %comms: 0.0028440168795189464 %optimizer_step 0.05582087535801597 %forward: 22.84780043661849 %backward: 61.32021531896525 [2025-04-12 07:23:18,504] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27093.49 | forward: 145384.12 | backward_microstep: 390198.96 | backward: 390190.11 | backward_inner_microstep: 390172.82 | backward_inner: 390166.23 | backward_allreduce_microstep: 8.30 | backward_allreduce: 2.82 | reduce_tied_grads: 0.33 | comms: 18.10 | reduce_grads: 0.23 | step: 355.20 | _step_clipping: 0.13 | _step_step: 353.39 | _step_zero_grad: 0.51 | _step_check_overflow: 0.54 samples/sec: 16.092 | iteration 25110/ 143000 | elapsed time per iteration (ms): 63632.2 | learning rate: 5.604E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.325841E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 07:33:59,497] [INFO] [logging.py:60:log_dist] [Rank 0] step=25120, skipped=29, lr=[0.0005603736610319416, 0.0005603736610319416], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25120 loss: 2.3046 iter time (s): 64.099 samples/sec: 15.975 %comms: 0.002823064036375598 %optimizer_step 0.05651785508255856 %forward: 22.70661667314772 %backward: 60.89265027437338 [2025-04-12 07:33:59,498] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31433.58 | forward: 145546.62 | backward_microstep: 390325.14 | backward: 390314.40 | backward_inner_microstep: 390295.28 | backward_inner: 390288.50 | backward_allreduce_microstep: 9.41 | backward_allreduce: 3.24 | reduce_tied_grads: 0.35 | comms: 18.10 | reduce_grads: 0.23 | step: 362.27 | _step_clipping: 0.15 | _step_step: 360.25 | _step_zero_grad: 0.60 | _step_check_overflow: 0.59 samples/sec: 15.975 | iteration 25120/ 143000 | elapsed time per iteration (ms): 64099.4 | learning rate: 5.604E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.319701E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 07:45:06,612] [INFO] [logging.py:60:log_dist] [Rank 0] step=25130, skipped=29, lr=[0.0005603409173124271, 0.0005603409173124271], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25130 loss: 2.3367 iter time (s): 66.711 samples/sec: 15.350 %comms: 0.002742226594415365 %optimizer_step 0.057570032576661254 %forward: 21.941550066110647 %backward: 58.56011904779434 [2025-04-12 07:45:06,613] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 56292.08 | forward: 146373.89 | backward_microstep: 390675.31 | backward: 390659.37 | backward_inner_microstep: 390638.66 | backward_inner: 390630.86 | backward_allreduce_microstep: 9.92 | backward_allreduce: 3.43 | reduce_tied_grads: 0.38 | comms: 18.29 | reduce_grads: 0.25 | step: 384.05 | _step_clipping: 0.15 | _step_step: 381.87 | _step_zero_grad: 0.67 | _step_check_overflow: 0.66 samples/sec: 15.350 | iteration 25130/ 143000 | elapsed time per iteration (ms): 66711.5 | learning rate: 5.603E-04 | approx flops per GPU: 66.2TFLOPS | lm_loss: 2.324745E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 07:55:51,714] [INFO] [logging.py:60:log_dist] [Rank 0] step=25140, skipped=29, lr=[0.0005603081610276925, 0.0005603081610276925], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25140 loss: 2.3448 iter time (s): 64.509 samples/sec: 15.874 %comms: 0.002817066245306311 %optimizer_step 0.05656392717312892 %forward: 22.55701735409454 %backward: 60.51042720520593 [2025-04-12 07:55:51,714] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 35521.22 | forward: 145514.09 | backward_microstep: 390361.05 | backward: 390349.47 | backward_inner_microstep: 390330.92 | backward_inner: 390322.29 | backward_allreduce_microstep: 8.88 | backward_allreduce: 3.05 | reduce_tied_grads: 0.34 | comms: 18.17 | reduce_grads: 0.22 | step: 364.89 | _step_clipping: 0.11 | _step_step: 362.94 | _step_zero_grad: 0.58 | _step_check_overflow: 0.62 samples/sec: 15.873 | iteration 25140/ 143000 | elapsed time per iteration (ms): 64510.1 | learning rate: 5.603E-04 | approx flops per GPU: 68.5TFLOPS | lm_loss: 2.330411E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 08:06:23,785] [INFO] [logging.py:60:log_dist] [Rank 0] step=25150, skipped=29, lr=[0.0005602753921793188, 0.0005602753921793188], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25150 loss: 2.2995 iter time (s): 63.206 samples/sec: 16.201 %comms: 0.002886190167321076 %optimizer_step 0.05880853406120493 %forward: 23.672162656493622 %backward: 61.754749138125675 [2025-04-12 08:06:23,785] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18398.22 | forward: 149623.45 | backward_microstep: 390341.84 | backward: 390330.15 | backward_inner_microstep: 390312.30 | backward_inner: 390305.66 | backward_allreduce_microstep: 8.40 | backward_allreduce: 2.88 | reduce_tied_grads: 0.36 | comms: 18.24 | reduce_grads: 0.25 | step: 371.71 | _step_clipping: 0.13 | _step_step: 369.82 | _step_zero_grad: 0.52 | _step_check_overflow: 0.58 samples/sec: 16.201 | iteration 25150/ 143000 | elapsed time per iteration (ms): 63207.1 | learning rate: 5.603E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.319130E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 08:17:00,030] [INFO] [logging.py:60:log_dist] [Rank 0] step=25160, skipped=29, lr=[0.0005602426107688876, 0.0005602426107688876], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25160 loss: 2.3255 iter time (s): 63.624 samples/sec: 16.095 %comms: 0.0028300816807897335 %optimizer_step 0.0572395383660308 %forward: 22.87336269555849 %backward: 61.3413959792137 [2025-04-12 08:17:00,030] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26732.53 | forward: 145529.28 | backward_microstep: 390296.41 | backward: 390277.95 | backward_inner_microstep: 390259.75 | backward_inner: 390253.05 | backward_allreduce_microstep: 8.80 | backward_allreduce: 3.04 | reduce_tied_grads: 0.34 | comms: 18.01 | reduce_grads: 0.21 | step: 364.18 | _step_clipping: 0.13 | _step_step: 362.47 | _step_zero_grad: 0.47 | _step_check_overflow: 0.54 samples/sec: 16.094 | iteration 25160/ 143000 | elapsed time per iteration (ms): 63624.5 | learning rate: 5.602E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.326778E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 08:27:43,235] [INFO] [logging.py:60:log_dist] [Rank 0] step=25170, skipped=29, lr=[0.000560209816797981, 0.000560209816797981], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25170 loss: 2.3319 iter time (s): 64.320 samples/sec: 15.920 %comms: 0.002809614740431394 %optimizer_step 0.0563974270430639 %forward: 22.64227274167788 %backward: 60.681219100261345 [2025-04-12 08:27:43,235] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 33565.24 | forward: 145634.86 | backward_microstep: 390312.69 | backward: 390300.98 | backward_inner_microstep: 390281.68 | backward_inner: 390274.56 | backward_allreduce_microstep: 9.39 | backward_allreduce: 3.21 | reduce_tied_grads: 0.33 | comms: 18.07 | reduce_grads: 0.22 | step: 362.75 | _step_clipping: 0.13 | _step_step: 360.83 | _step_zero_grad: 0.59 | _step_check_overflow: 0.55 samples/sec: 15.920 | iteration 25170/ 143000 | elapsed time per iteration (ms): 64320.5 | learning rate: 5.602E-04 | approx flops per GPU: 68.7TFLOPS | lm_loss: 2.330669E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 08:38:20,957] [INFO] [logging.py:60:log_dist] [Rank 0] step=25180, skipped=29, lr=[0.0005601770102681818, 0.0005601770102681818], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25180 loss: 2.3167 iter time (s): 63.772 samples/sec: 16.057 %comms: 0.00286364387436455 %optimizer_step 0.05745094861076478 %forward: 22.81514921909455 %backward: 61.181139137142715 [2025-04-12 08:38:20,957] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28364.98 | forward: 145495.81 | backward_microstep: 390171.49 | backward: 390161.79 | backward_inner_microstep: 390143.37 | backward_inner: 390136.45 | backward_allreduce_microstep: 9.01 | backward_allreduce: 3.08 | reduce_tied_grads: 0.33 | comms: 18.26 | reduce_grads: 0.24 | step: 366.37 | _step_clipping: 0.14 | _step_step: 364.33 | _step_zero_grad: 0.64 | _step_check_overflow: 0.61 samples/sec: 16.057 | iteration 25180/ 143000 | elapsed time per iteration (ms): 63772.2 | learning rate: 5.602E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.321536E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 08:48:55,885] [INFO] [logging.py:60:log_dist] [Rank 0] step=25190, skipped=29, lr=[0.0005601441911810736, 0.0005601441911810736], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25190 loss: 2.3022 iter time (s): 63.492 samples/sec: 16.128 %comms: 0.0028593450681499303 %optimizer_step 0.057201320879534855 %forward: 22.91148949462086 %backward: 61.46519546367192 [2025-04-12 08:48:55,886] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25497.32 | forward: 145470.17 | backward_microstep: 390266.57 | backward: 390256.27 | backward_inner_microstep: 390235.60 | backward_inner: 390228.84 | backward_allreduce_microstep: 9.01 | backward_allreduce: 3.12 | reduce_tied_grads: 0.34 | comms: 18.15 | reduce_grads: 0.21 | step: 363.18 | _step_clipping: 0.12 | _step_step: 361.33 | _step_zero_grad: 0.53 | _step_check_overflow: 0.59 samples/sec: 16.128 | iteration 25190/ 143000 | elapsed time per iteration (ms): 63492.9 | learning rate: 5.601E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.314567E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 09:00:03,569] [INFO] [logging.py:60:log_dist] [Rank 0] step=25200, skipped=29, lr=[0.00056011135953824, 0.00056011135953824], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25200 loss: 2.3013 iter time (s): 66.768 samples/sec: 15.337 %comms: 0.0027042872468860456 %optimizer_step 0.053810824065252526 %forward: 21.863517885705814 %backward: 58.46783766791843 [2025-04-12 09:00:03,570] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 57637.75 | forward: 145977.78 | backward_microstep: 390389.33 | backward: 390376.56 | backward_inner_microstep: 390355.40 | backward_inner: 390348.03 | backward_allreduce_microstep: 11.12 | backward_allreduce: 5.12 | reduce_tied_grads: 0.33 | comms: 18.06 | reduce_grads: 0.22 | step: 359.28 | _step_clipping: 0.16 | _step_step: 357.28 | _step_zero_grad: 0.57 | _step_check_overflow: 0.65 samples/sec: 15.337 | iteration 25200/ 143000 | elapsed time per iteration (ms): 66768.4 | learning rate: 5.601E-04 | approx flops per GPU: 66.2TFLOPS | lm_loss: 2.321443E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 09:10:46,370] [INFO] [logging.py:60:log_dist] [Rank 0] step=25210, skipped=29, lr=[0.000560078515341266, 0.000560078515341266], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25210 loss: 2.3282 iter time (s): 64.279 samples/sec: 15.930 %comms: 0.0028388651312722703 %optimizer_step 0.05560140906594914 %forward: 22.685474562219692 %backward: 60.712307966614475 [2025-04-12 09:10:46,371] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 33046.09 | forward: 145821.08 | backward_microstep: 390267.15 | backward: 390255.64 | backward_inner_microstep: 390237.66 | backward_inner: 390230.97 | backward_allreduce_microstep: 8.68 | backward_allreduce: 3.00 | reduce_tied_grads: 0.54 | comms: 18.25 | reduce_grads: 0.22 | step: 357.40 | _step_clipping: 0.12 | _step_step: 355.48 | _step_zero_grad: 0.54 | _step_check_overflow: 0.66 samples/sec: 15.930 | iteration 25210/ 143000 | elapsed time per iteration (ms): 64280.1 | learning rate: 5.601E-04 | approx flops per GPU: 68.7TFLOPS | lm_loss: 2.325548E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 09:21:26,421] [INFO] [logging.py:60:log_dist] [Rank 0] step=25220, skipped=29, lr=[0.0005600456585917365, 0.0005600456585917365], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25220 loss: 2.3427 iter time (s): 64.004 samples/sec: 15.999 %comms: 0.0028164601903605244 %optimizer_step 0.05728016747819825 %forward: 22.765529855661462 %backward: 60.96897023458976 [2025-04-12 09:21:26,421] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30400.72 | forward: 145709.46 | backward_microstep: 390238.22 | backward: 390228.36 | backward_inner_microstep: 390209.84 | backward_inner: 390203.04 | backward_allreduce_microstep: 8.96 | backward_allreduce: 3.09 | reduce_tied_grads: 0.32 | comms: 18.03 | reduce_grads: 0.22 | step: 366.62 | _step_clipping: 0.13 | _step_step: 364.65 | _step_zero_grad: 0.60 | _step_check_overflow: 0.61 samples/sec: 15.999 | iteration 25220/ 143000 | elapsed time per iteration (ms): 64005.1 | learning rate: 5.600E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.338560E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 09:32:04,464] [INFO] [logging.py:60:log_dist] [Rank 0] step=25230, skipped=29, lr=[0.0005600127892912374, 0.0005600127892912374], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25230 loss: 2.3229 iter time (s): 63.804 samples/sec: 16.049 %comms: 0.002829277039839644 %optimizer_step 0.055609614910911716 %forward: 22.79834895192598 %backward: 61.14744400274162 [2025-04-12 09:32:04,465] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28737.18 | forward: 145462.13 | backward_microstep: 390154.31 | backward: 390143.93 | backward_inner_microstep: 390126.05 | backward_inner: 390119.40 | backward_allreduce_microstep: 8.64 | backward_allreduce: 2.97 | reduce_tied_grads: 0.29 | comms: 18.05 | reduce_grads: 0.21 | step: 354.81 | _step_clipping: 0.12 | _step_step: 352.89 | _step_zero_grad: 0.57 | _step_check_overflow: 0.63 samples/sec: 16.049 | iteration 25230/ 143000 | elapsed time per iteration (ms): 63804.4 | learning rate: 5.600E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.323885E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 09:42:43,556] [INFO] [logging.py:60:log_dist] [Rank 0] step=25240, skipped=29, lr=[0.0005599799074413552, 0.0005599799074413552], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25240 loss: 2.3313 iter time (s): 63.908 samples/sec: 16.023 %comms: 0.002835200235539115 %optimizer_step 0.057029016743362325 %forward: 22.78850827275171 %backward: 61.04845237636012 [2025-04-12 09:42:43,556] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29613.88 | forward: 145637.90 | backward_microstep: 390160.72 | backward: 390151.41 | backward_inner_microstep: 390132.86 | backward_inner: 390126.09 | backward_allreduce_microstep: 9.15 | backward_allreduce: 3.18 | reduce_tied_grads: 0.33 | comms: 18.12 | reduce_grads: 0.22 | step: 364.46 | _step_clipping: 0.12 | _step_step: 362.36 | _step_zero_grad: 0.57 | _step_check_overflow: 0.78 samples/sec: 16.023 | iteration 25240/ 143000 | elapsed time per iteration (ms): 63909.1 | learning rate: 5.600E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.331207E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 09:53:21,140] [INFO] [logging.py:60:log_dist] [Rank 0] step=25250, skipped=29, lr=[0.0005599470130436769, 0.0005599470130436769], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25250 loss: 2.3165 iter time (s): 63.758 samples/sec: 16.061 %comms: 0.0031822999478470543 %optimizer_step 0.058405661384040145 %forward: 22.842192582324657 %backward: 61.215127525912905 [2025-04-12 09:53:21,141] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27904.60 | forward: 145636.90 | backward_microstep: 390305.41 | backward: 390294.48 | backward_inner_microstep: 390275.95 | backward_inner: 390269.22 | backward_allreduce_microstep: 9.03 | backward_allreduce: 3.11 | reduce_tied_grads: 0.38 | comms: 20.29 | reduce_grads: 0.26 | step: 372.38 | _step_clipping: 0.15 | _step_step: 370.35 | _step_zero_grad: 0.56 | _step_check_overflow: 0.62 samples/sec: 16.061 | iteration 25250/ 143000 | elapsed time per iteration (ms): 63758.4 | learning rate: 5.599E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.326911E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 10:03:58,615] [INFO] [logging.py:60:log_dist] [Rank 0] step=25260, skipped=29, lr=[0.0005599141060997901, 0.0005599141060997901], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25260 loss: 2.3122 iter time (s): 63.747 samples/sec: 16.064 %comms: 0.0028209216721186304 %optimizer_step 0.056704213333344425 %forward: 22.839263789246548 %backward: 61.217220984326616 [2025-04-12 10:03:58,615] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27911.52 | forward: 145593.08 | backward_microstep: 390252.02 | backward: 390240.41 | backward_inner_microstep: 390222.25 | backward_inner: 390213.95 | backward_allreduce_microstep: 8.81 | backward_allreduce: 3.02 | reduce_tied_grads: 0.33 | comms: 17.98 | reduce_grads: 0.21 | step: 361.47 | _step_clipping: 0.13 | _step_step: 359.62 | _step_zero_grad: 0.56 | _step_check_overflow: 0.55 samples/sec: 16.063 | iteration 25260/ 143000 | elapsed time per iteration (ms): 63747.5 | learning rate: 5.599E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.321000E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 10:14:43,457] [INFO] [logging.py:60:log_dist] [Rank 0] step=25270, skipped=29, lr=[0.0005598811866112831, 0.0005598811866112831], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25270 loss: 2.3393 iter time (s): 64.484 samples/sec: 15.880 %comms: 0.0027879125592912583 %optimizer_step 0.05508721919719637 %forward: 22.559182249309156 %backward: 60.51218642323276 [2025-04-12 10:14:43,458] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 35476.53 | forward: 145469.83 | backward_microstep: 390214.92 | backward: 390204.63 | backward_inner_microstep: 390186.26 | backward_inner: 390179.58 | backward_allreduce_microstep: 9.00 | backward_allreduce: 3.09 | reduce_tied_grads: 0.30 | comms: 17.98 | reduce_grads: 0.20 | step: 355.22 | _step_clipping: 0.11 | _step_step: 353.48 | _step_zero_grad: 0.53 | _step_check_overflow: 0.51 samples/sec: 15.880 | iteration 25270/ 143000 | elapsed time per iteration (ms): 64484.2 | learning rate: 5.599E-04 | approx flops per GPU: 68.5TFLOPS | lm_loss: 2.328046E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 10:25:28,563] [INFO] [logging.py:60:log_dist] [Rank 0] step=25280, skipped=29, lr=[0.0005598482545797446, 0.0005598482545797446], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25280 loss: 2.3313 iter time (s): 64.510 samples/sec: 15.874 %comms: 0.002806069531093722 %optimizer_step 0.05700876275808202 %forward: 22.56717602396445 %backward: 60.495705648475194 [2025-04-12 10:25:28,563] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 35474.64 | forward: 145580.66 | backward_microstep: 390268.01 | backward: 390257.28 | backward_inner_microstep: 390238.02 | backward_inner: 390231.10 | backward_allreduce_microstep: 9.49 | backward_allreduce: 3.42 | reduce_tied_grads: 0.33 | comms: 18.10 | reduce_grads: 0.23 | step: 367.76 | _step_clipping: 0.14 | _step_step: 365.81 | _step_zero_grad: 0.61 | _step_check_overflow: 0.57 samples/sec: 15.873 | iteration 25280/ 143000 | elapsed time per iteration (ms): 64510.6 | learning rate: 5.598E-04 | approx flops per GPU: 68.5TFLOPS | lm_loss: 2.328128E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 10:36:18,927] [INFO] [logging.py:60:log_dist] [Rank 0] step=25290, skipped=29, lr=[0.0005598153100067641, 0.0005598153100067641], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25290 loss: 2.3234 iter time (s): 65.036 samples/sec: 15.745 %comms: 0.002784733867076997 %optimizer_step 0.056295930865063945 %forward: 22.395159805095787 %backward: 59.99726460257533 [2025-04-12 10:36:18,928] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 40800.47 | forward: 145648.81 | backward_microstep: 390208.27 | backward: 390197.28 | backward_inner_microstep: 390178.16 | backward_inner: 390170.99 | backward_allreduce_microstep: 9.34 | backward_allreduce: 3.25 | reduce_tied_grads: 0.33 | comms: 18.11 | reduce_grads: 0.25 | step: 366.13 | _step_clipping: 0.14 | _step_step: 364.02 | _step_zero_grad: 0.63 | _step_check_overflow: 0.67 samples/sec: 15.745 | iteration 25290/ 143000 | elapsed time per iteration (ms): 65036.5 | learning rate: 5.598E-04 | approx flops per GPU: 67.9TFLOPS | lm_loss: 2.316026E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 10:46:53,097] [INFO] [logging.py:60:log_dist] [Rank 0] step=25300, skipped=29, lr=[0.0005597823528939318, 0.0005597823528939318], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25300 loss: 2.3331 iter time (s): 63.416 samples/sec: 16.147 %comms: 0.002824531160009401 %optimizer_step 0.05589691110394198 %forward: 22.95518613877272 %backward: 61.519356722688535 [2025-04-12 10:46:53,098] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24753.04 | forward: 145573.44 | backward_microstep: 390142.34 | backward: 390133.39 | backward_inner_microstep: 390114.95 | backward_inner: 390108.26 | backward_allreduce_microstep: 9.06 | backward_allreduce: 3.10 | reduce_tied_grads: 0.29 | comms: 17.91 | reduce_grads: 0.22 | step: 354.48 | _step_clipping: 0.12 | _step_step: 352.74 | _step_zero_grad: 0.51 | _step_check_overflow: 0.53 samples/sec: 16.147 | iteration 25300/ 143000 | elapsed time per iteration (ms): 63416.9 | learning rate: 5.598E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.319038E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 10:57:31,832] [INFO] [logging.py:60:log_dist] [Rank 0] step=25310, skipped=29, lr=[0.0005597493832428382, 0.0005597493832428382], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25310 loss: 2.3421 iter time (s): 63.872 samples/sec: 16.032 %comms: 0.0028153410535350456 %optimizer_step 0.05494907588048992 %forward: 22.796026887442856 %backward: 61.08706480449931 [2025-04-12 10:57:31,833] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29249.00 | forward: 145603.58 | backward_microstep: 390186.85 | backward: 390177.43 | backward_inner_microstep: 390160.51 | backward_inner: 390154.15 | backward_allreduce_microstep: 8.15 | backward_allreduce: 2.80 | reduce_tied_grads: 0.29 | comms: 17.98 | reduce_grads: 0.20 | step: 350.97 | _step_clipping: 0.11 | _step_step: 349.23 | _step_zero_grad: 0.50 | _step_check_overflow: 0.54 samples/sec: 16.032 | iteration 25310/ 143000 | elapsed time per iteration (ms): 63873.5 | learning rate: 5.597E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.335751E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 11:08:02,642] [INFO] [logging.py:60:log_dist] [Rank 0] step=25320, skipped=29, lr=[0.0005597164010550746, 0.0005597164010550746], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25320 loss: 2.3225 iter time (s): 63.080 samples/sec: 16.233 %comms: 0.0028762372305530786 %optimizer_step 0.05616979607720307 %forward: 23.059103124154372 %backward: 61.868897532286425 [2025-04-12 11:08:02,642] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21332.33 | forward: 145457.71 | backward_microstep: 390281.33 | backward: 390271.39 | backward_inner_microstep: 390254.45 | backward_inner: 390247.99 | backward_allreduce_microstep: 8.15 | backward_allreduce: 2.82 | reduce_tied_grads: 0.35 | comms: 18.14 | reduce_grads: 0.21 | step: 354.32 | _step_clipping: 0.13 | _step_step: 352.55 | _step_zero_grad: 0.50 | _step_check_overflow: 0.56 samples/sec: 16.233 | iteration 25320/ 143000 | elapsed time per iteration (ms): 63081.0 | learning rate: 5.597E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.326330E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 11:18:42,894] [INFO] [logging.py:60:log_dist] [Rank 0] step=25330, skipped=29, lr=[0.0005596834063322329, 0.0005596834063322329], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25330 loss: 2.3178 iter time (s): 64.025 samples/sec: 15.994 %comms: 0.0028474862940191174 %optimizer_step 0.05850231456105793 %forward: 22.707586148711894 %backward: 60.958072821752864 [2025-04-12 11:18:42,895] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30822.44 | forward: 145384.40 | backward_microstep: 390295.89 | backward: 390281.58 | backward_inner_microstep: 390263.19 | backward_inner: 390256.33 | backward_allreduce_microstep: 8.93 | backward_allreduce: 3.05 | reduce_tied_grads: 0.40 | comms: 18.23 | reduce_grads: 0.23 | step: 374.56 | _step_clipping: 0.14 | _step_step: 372.39 | _step_zero_grad: 0.70 | _step_check_overflow: 0.61 samples/sec: 15.994 | iteration 25330/ 143000 | elapsed time per iteration (ms): 64025.2 | learning rate: 5.597E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.317061E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 11:29:24,702] [INFO] [logging.py:60:log_dist] [Rank 0] step=25340, skipped=29, lr=[0.0005596503990759056, 0.0005596503990759056], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25340 loss: 2.3315 iter time (s): 64.180 samples/sec: 15.955 %comms: 0.002824381648621287 %optimizer_step 0.05718509141584103 %forward: 22.662669546877186 %backward: 60.81647200923462 [2025-04-12 11:29:24,703] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32269.76 | forward: 145449.68 | backward_microstep: 390333.76 | backward: 390321.91 | backward_inner_microstep: 390299.86 | backward_inner: 390291.10 | backward_allreduce_microstep: 10.32 | backward_allreduce: 4.07 | reduce_tied_grads: 0.35 | comms: 18.13 | reduce_grads: 0.23 | step: 367.02 | _step_clipping: 0.14 | _step_step: 364.99 | _step_zero_grad: 0.52 | _step_check_overflow: 0.73 samples/sec: 15.955 | iteration 25340/ 143000 | elapsed time per iteration (ms): 64180.9 | learning rate: 5.597E-04 | approx flops per GPU: 68.8TFLOPS | lm_loss: 2.325426E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 11:40:12,909] [INFO] [logging.py:60:log_dist] [Rank 0] step=25350, skipped=29, lr=[0.0005596173792876858, 0.0005596173792876858], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25350 loss: 2.3235 iter time (s): 64.820 samples/sec: 15.798 %comms: 0.003304977986319344 %optimizer_step 0.05553612857334961 %forward: 22.467070199106224 %backward: 60.20699988503934 [2025-04-12 11:40:12,909] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 38562.68 | forward: 145631.52 | backward_microstep: 390274.78 | backward: 390261.69 | backward_inner_microstep: 390241.96 | backward_inner: 390235.17 | backward_allreduce_microstep: 8.64 | backward_allreduce: 2.96 | reduce_tied_grads: 0.31 | comms: 21.42 | reduce_grads: 0.20 | step: 359.99 | _step_clipping: 0.12 | _step_step: 358.14 | _step_zero_grad: 0.51 | _step_check_overflow: 0.63 samples/sec: 15.797 | iteration 25350/ 143000 | elapsed time per iteration (ms): 64820.6 | learning rate: 5.596E-04 | approx flops per GPU: 68.1TFLOPS | lm_loss: 2.327548E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 11:50:52,254] [INFO] [logging.py:60:log_dist] [Rank 0] step=25360, skipped=29, lr=[0.000559584346969167, 0.000559584346969167], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25360 loss: 2.3161 iter time (s): 63.934 samples/sec: 16.017 %comms: 0.0028160966707409746 %optimizer_step 0.05754464353771074 %forward: 22.773298870419907 %backward: 61.040426336361605 [2025-04-12 11:50:52,255] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29723.83 | forward: 145598.69 | backward_microstep: 390266.41 | backward: 390255.54 | backward_inner_microstep: 390236.28 | backward_inner: 390229.81 | backward_allreduce_microstep: 8.29 | backward_allreduce: 2.84 | reduce_tied_grads: 0.31 | comms: 18.00 | reduce_grads: 0.21 | step: 367.91 | _step_clipping: 0.13 | _step_step: 366.10 | _step_zero_grad: 0.54 | _step_check_overflow: 0.53 samples/sec: 16.016 | iteration 25360/ 143000 | elapsed time per iteration (ms): 63934.5 | learning rate: 5.596E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.324552E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 12:01:40,303] [INFO] [logging.py:60:log_dist] [Rank 0] step=25370, skipped=29, lr=[0.0005595513021219435, 0.0005595513021219435], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25370 loss: 2.3205 iter time (s): 64.804 samples/sec: 15.801 %comms: 0.0028495392532731263 %optimizer_step 0.05909969355288229 %forward: 22.485530515520985 %backward: 60.24417565846597 [2025-04-12 12:01:40,304] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 38113.66 | forward: 145715.86 | backward_microstep: 390422.97 | backward: 390408.05 | backward_inner_microstep: 390388.16 | backward_inner: 390380.83 | backward_allreduce_microstep: 9.54 | backward_allreduce: 3.28 | reduce_tied_grads: 0.39 | comms: 18.47 | reduce_grads: 0.25 | step: 382.99 | _step_clipping: 0.17 | _step_step: 380.67 | _step_zero_grad: 0.68 | _step_check_overflow: 0.75 samples/sec: 15.801 | iteration 25370/ 143000 | elapsed time per iteration (ms): 64805.0 | learning rate: 5.596E-04 | approx flops per GPU: 68.2TFLOPS | lm_loss: 2.322142E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 12:12:25,173] [INFO] [logging.py:60:log_dist] [Rank 0] step=25380, skipped=29, lr=[0.0005595182447476105, 0.0005595182447476105], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25380 loss: 2.3007 iter time (s): 64.486 samples/sec: 15.879 %comms: 0.0028568639519419158 %optimizer_step 0.05977822646997242 %forward: 22.60303822813573 %backward: 60.52779143081079 [2025-04-12 12:12:25,173] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 34988.55 | forward: 145758.50 | backward_microstep: 390333.91 | backward: 390320.98 | backward_inner_microstep: 390300.72 | backward_inner: 390293.23 | backward_allreduce_microstep: 9.72 | backward_allreduce: 3.35 | reduce_tied_grads: 0.56 | comms: 18.42 | reduce_grads: 0.25 | step: 385.49 | _step_clipping: 0.13 | _step_step: 383.44 | _step_zero_grad: 0.64 | _step_check_overflow: 0.59 samples/sec: 15.879 | iteration 25380/ 143000 | elapsed time per iteration (ms): 64486.9 | learning rate: 5.595E-04 | approx flops per GPU: 68.5TFLOPS | lm_loss: 2.322795E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 12:23:03,958] [INFO] [logging.py:60:log_dist] [Rank 0] step=25390, skipped=29, lr=[0.0005594851748477632, 0.0005594851748477632], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25390 loss: 2.3171 iter time (s): 63.878 samples/sec: 16.031 %comms: 0.0028330838210291523 %optimizer_step 0.055929975278004085 %forward: 22.76849501894298 %backward: 61.10900978649233 [2025-04-12 12:23:03,959] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29233.99 | forward: 145440.51 | backward_microstep: 390364.78 | backward: 390351.90 | backward_inner_microstep: 390330.16 | backward_inner: 390322.97 | backward_allreduce_microstep: 9.82 | backward_allreduce: 3.34 | reduce_tied_grads: 0.30 | comms: 18.10 | reduce_grads: 0.21 | step: 357.27 | _step_clipping: 0.12 | _step_step: 355.39 | _step_zero_grad: 0.51 | _step_check_overflow: 0.64 samples/sec: 16.030 | iteration 25390/ 143000 | elapsed time per iteration (ms): 63878.5 | learning rate: 5.595E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.328875E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 12:33:42,797] [INFO] [logging.py:60:log_dist] [Rank 0] step=25400, skipped=29, lr=[0.0005594520924239977, 0.0005594520924239977], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25400 loss: 2.3169 iter time (s): 63.883 samples/sec: 16.029 %comms: 0.0028269508921641225 %optimizer_step 0.05662653549866319 %forward: 22.783601139535854 %backward: 61.088467193950905 [2025-04-12 12:33:42,798] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29256.24 | forward: 145549.14 | backward_microstep: 390266.70 | backward: 390253.22 | backward_inner_microstep: 390234.76 | backward_inner: 390227.87 | backward_allreduce_microstep: 8.82 | backward_allreduce: 3.04 | reduce_tied_grads: 0.34 | comms: 18.06 | reduce_grads: 0.22 | step: 361.75 | _step_clipping: 0.12 | _step_step: 359.93 | _step_zero_grad: 0.53 | _step_check_overflow: 0.53 samples/sec: 16.029 | iteration 25400/ 143000 | elapsed time per iteration (ms): 63883.9 | learning rate: 5.595E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.331092E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 12:44:19,376] [INFO] [logging.py:60:log_dist] [Rank 0] step=25410, skipped=29, lr=[0.0005594189974779109, 0.0005594189974779109], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25410 loss: 2.3214 iter time (s): 63.657 samples/sec: 16.086 %comms: 0.0028391599853213427 %optimizer_step 0.05639896459938362 %forward: 22.897957417444452 %backward: 61.32346394205975 [2025-04-12 12:44:19,377] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26682.63 | forward: 145762.17 | backward_microstep: 390380.92 | backward: 390368.50 | backward_inner_microstep: 390350.17 | backward_inner: 390343.28 | backward_allreduce_microstep: 8.76 | backward_allreduce: 3.04 | reduce_tied_grads: 0.35 | comms: 18.07 | reduce_grads: 0.22 | step: 359.02 | _step_clipping: 0.12 | _step_step: 357.27 | _step_zero_grad: 0.49 | _step_check_overflow: 0.53 samples/sec: 16.086 | iteration 25410/ 143000 | elapsed time per iteration (ms): 63657.9 | learning rate: 5.594E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.315329E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 12:55:05,523] [INFO] [logging.py:60:log_dist] [Rank 0] step=25420, skipped=29, lr=[0.0005593858900111001, 0.0005593858900111001], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25420 loss: 2.3188 iter time (s): 64.614 samples/sec: 15.848 %comms: 0.002792836838913346 %optimizer_step 0.05648184014715845 %forward: 22.571721501254952 %backward: 60.388063573646534 [2025-04-12 12:55:05,524] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 36356.90 | forward: 145845.15 | backward_microstep: 390204.10 | backward: 390192.04 | backward_inner_microstep: 390173.27 | backward_inner: 390166.32 | backward_allreduce_microstep: 9.05 | backward_allreduce: 3.12 | reduce_tied_grads: 0.34 | comms: 18.05 | reduce_grads: 0.24 | step: 364.95 | _step_clipping: 0.12 | _step_step: 362.93 | _step_zero_grad: 0.58 | _step_check_overflow: 0.68 samples/sec: 15.848 | iteration 25420/ 143000 | elapsed time per iteration (ms): 64614.7 | learning rate: 5.594E-04 | approx flops per GPU: 68.4TFLOPS | lm_loss: 2.331021E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 13:05:37,559] [INFO] [logging.py:60:log_dist] [Rank 0] step=25430, skipped=29, lr=[0.000559352770025163, 0.000559352770025163], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25430 loss: 2.3452 iter time (s): 63.203 samples/sec: 16.202 %comms: 0.002897969952715246 %optimizer_step 0.05700807062092526 %forward: 22.98679366917473 %backward: 61.73830019256496 [2025-04-12 13:05:37,560] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22785.55 | forward: 145283.36 | backward_microstep: 390216.95 | backward: 390204.39 | backward_inner_microstep: 390186.35 | backward_inner: 390179.77 | backward_allreduce_microstep: 8.64 | backward_allreduce: 2.94 | reduce_tied_grads: 0.34 | comms: 18.32 | reduce_grads: 0.23 | step: 360.31 | _step_clipping: 0.14 | _step_step: 358.33 | _step_zero_grad: 0.55 | _step_check_overflow: 0.65 samples/sec: 16.202 | iteration 25430/ 143000 | elapsed time per iteration (ms): 63203.6 | learning rate: 5.594E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.334439E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 13:16:28,033] [INFO] [logging.py:60:log_dist] [Rank 0] step=25440, skipped=29, lr=[0.0005593196375216984, 0.0005593196375216984], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25440 loss: 2.3535 iter time (s): 65.047 samples/sec: 15.743 %comms: 0.0027881209147375533 %optimizer_step 0.05773437597959312 %forward: 22.428726539170878 %backward: 60.01438193456812 [2025-04-12 13:16:28,033] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 40369.89 | forward: 145891.30 | backward_microstep: 390388.28 | backward: 390373.31 | backward_inner_microstep: 390353.65 | backward_inner: 390344.58 | backward_allreduce_microstep: 9.42 | backward_allreduce: 3.39 | reduce_tied_grads: 0.36 | comms: 18.14 | reduce_grads: 0.24 | step: 375.54 | _step_clipping: 0.17 | _step_step: 373.52 | _step_zero_grad: 0.60 | _step_check_overflow: 0.51 samples/sec: 15.742 | iteration 25440/ 143000 | elapsed time per iteration (ms): 65047.3 | learning rate: 5.593E-04 | approx flops per GPU: 67.9TFLOPS | lm_loss: 2.337019E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 13:27:08,381] [INFO] [logging.py:60:log_dist] [Rank 0] step=25450, skipped=29, lr=[0.0005592864925023051, 0.0005592864925023051], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25450 loss: 2.3125 iter time (s): 64.034 samples/sec: 15.991 %comms: 0.0028492882432283595 %optimizer_step 0.05583053832296503 %forward: 22.705965537250684 %backward: 60.9451247268333 [2025-04-12 13:27:08,382] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30905.23 | forward: 145396.08 | backward_microstep: 390269.64 | backward: 390257.89 | backward_inner_microstep: 390240.24 | backward_inner: 390233.42 | backward_allreduce_microstep: 8.39 | backward_allreduce: 2.89 | reduce_tied_grads: 0.34 | comms: 18.25 | reduce_grads: 0.21 | step: 357.51 | _step_clipping: 0.14 | _step_step: 355.80 | _step_zero_grad: 0.47 | _step_check_overflow: 0.53 samples/sec: 15.991 | iteration 25450/ 143000 | elapsed time per iteration (ms): 64034.9 | learning rate: 5.593E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.329177E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 13:37:44,027] [INFO] [logging.py:60:log_dist] [Rank 0] step=25460, skipped=29, lr=[0.000559253334968583, 0.000559253334968583], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25460 loss: 2.3174 iter time (s): 63.564 samples/sec: 16.110 %comms: 0.002859156458516893 %optimizer_step 0.05697634509703308 %forward: 22.879600685603716 %backward: 61.387666169541454 [2025-04-12 13:37:44,028] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26208.65 | forward: 145431.82 | backward_microstep: 390214.80 | backward: 390204.37 | backward_inner_microstep: 390184.38 | backward_inner: 390177.60 | backward_allreduce_microstep: 8.87 | backward_allreduce: 3.03 | reduce_tied_grads: 0.33 | comms: 18.17 | reduce_grads: 0.21 | step: 362.16 | _step_clipping: 0.12 | _step_step: 360.25 | _step_zero_grad: 0.52 | _step_check_overflow: 0.68 samples/sec: 16.110 | iteration 25460/ 143000 | elapsed time per iteration (ms): 63564.6 | learning rate: 5.593E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.317772E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 13:48:31,702] [INFO] [logging.py:60:log_dist] [Rank 0] step=25470, skipped=29, lr=[0.0005592201649221325, 0.0005592201649221325], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25470 loss: 2.3434 iter time (s): 64.767 samples/sec: 15.811 %comms: 0.0027781886079961994 %optimizer_step 0.057122421740567524 %forward: 22.53113077764989 %backward: 60.240357488784454 [2025-04-12 13:48:31,702] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 37786.75 | forward: 145927.02 | backward_microstep: 390167.45 | backward: 390157.77 | backward_inner_microstep: 390139.69 | backward_inner: 390132.90 | backward_allreduce_microstep: 8.71 | backward_allreduce: 2.98 | reduce_tied_grads: 0.33 | comms: 17.99 | reduce_grads: 0.23 | step: 369.96 | _step_clipping: 0.13 | _step_step: 368.20 | _step_zero_grad: 0.55 | _step_check_overflow: 0.48 samples/sec: 15.810 | iteration 25470/ 143000 | elapsed time per iteration (ms): 64767.4 | learning rate: 5.592E-04 | approx flops per GPU: 68.2TFLOPS | lm_loss: 2.331667E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 13:59:05,787] [INFO] [logging.py:60:log_dist] [Rank 0] step=25480, skipped=29, lr=[0.0005591869823645544, 0.0005591869823645544], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25480 loss: 2.3271 iter time (s): 63.408 samples/sec: 16.149 %comms: 0.0028282920245946415 %optimizer_step 0.0555708490195943 %forward: 22.944068987154303 %backward: 61.54801802769182 [2025-04-12 13:59:05,787] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24534.51 | forward: 145483.53 | backward_microstep: 390275.28 | backward: 390263.09 | backward_inner_microstep: 390245.18 | backward_inner: 390238.53 | backward_allreduce_microstep: 8.60 | backward_allreduce: 2.96 | reduce_tied_grads: 0.28 | comms: 17.93 | reduce_grads: 0.20 | step: 352.36 | _step_clipping: 0.12 | _step_step: 350.54 | _step_zero_grad: 0.53 | _step_check_overflow: 0.59 samples/sec: 16.149 | iteration 25480/ 143000 | elapsed time per iteration (ms): 63408.5 | learning rate: 5.592E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.326337E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 14:09:47,091] [INFO] [logging.py:60:log_dist] [Rank 0] step=25490, skipped=29, lr=[0.0005591537872974504, 0.0005591537872974504], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25490 loss: 2.3165 iter time (s): 64.130 samples/sec: 15.968 %comms: 0.002840058612724564 %optimizer_step 0.05667473179813622 %forward: 22.666991057632004 %backward: 60.83560135948392 [2025-04-12 14:09:47,092] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32011.10 | forward: 145363.22 | backward_microstep: 390147.91 | backward: 390138.20 | backward_inner_microstep: 390119.67 | backward_inner: 390112.96 | backward_allreduce_microstep: 8.94 | backward_allreduce: 3.07 | reduce_tied_grads: 0.37 | comms: 18.21 | reduce_grads: 0.24 | step: 363.45 | _step_clipping: 0.14 | _step_step: 361.62 | _step_zero_grad: 0.51 | _step_check_overflow: 0.58 samples/sec: 15.967 | iteration 25490/ 143000 | elapsed time per iteration (ms): 64130.5 | learning rate: 5.592E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.318824E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 14:20:16,656] [INFO] [logging.py:60:log_dist] [Rank 0] step=25500, skipped=29, lr=[0.0005591205797224224, 0.0005591205797224224], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25500 loss: 2.3204 iter time (s): 62.956 samples/sec: 16.265 %comms: 0.0028563999340160064 %optimizer_step 0.056963677575730204 %forward: 23.092892337126223 %backward: 61.98408666781771 [2025-04-12 14:20:16,657] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20148.00 | forward: 145383.37 | backward_microstep: 390236.93 | backward: 390226.36 | backward_inner_microstep: 390208.99 | backward_inner: 390202.52 | backward_allreduce_microstep: 8.39 | backward_allreduce: 2.99 | reduce_tied_grads: 0.31 | comms: 17.98 | reduce_grads: 0.21 | step: 358.62 | _step_clipping: 0.12 | _step_step: 356.84 | _step_zero_grad: 0.48 | _step_check_overflow: 0.58 samples/sec: 16.265 | iteration 25500/ 143000 | elapsed time per iteration (ms): 62956.5 | learning rate: 5.591E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.323078E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 14:30:57,262] [INFO] [logging.py:60:log_dist] [Rank 0] step=25510, skipped=29, lr=[0.0005590873596410734, 0.0005590873596410734], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25510 loss: 2.3114 iter time (s): 64.060 samples/sec: 15.985 %comms: 0.002848184561198363 %optimizer_step 0.05840715549176186 %forward: 22.73722740124825 %backward: 60.91306603206642 [2025-04-12 14:30:57,263] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30953.33 | forward: 145654.59 | backward_microstep: 390219.31 | backward: 390208.85 | backward_inner_microstep: 390190.50 | backward_inner: 390183.71 | backward_allreduce_microstep: 8.94 | backward_allreduce: 3.06 | reduce_tied_grads: 0.33 | comms: 18.25 | reduce_grads: 0.21 | step: 374.16 | _step_clipping: 0.13 | _step_step: 372.27 | _step_zero_grad: 0.55 | _step_check_overflow: 0.58 samples/sec: 15.985 | iteration 25510/ 143000 | elapsed time per iteration (ms): 64060.6 | learning rate: 5.591E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.326347E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 14:41:48,823] [INFO] [logging.py:60:log_dist] [Rank 0] step=25520, skipped=29, lr=[0.0005590541270550066, 0.0005590541270550066], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25520 loss: 2.3176 iter time (s): 65.156 samples/sec: 15.716 %comms: 0.0027913282595863824 %optimizer_step 0.056193475451800444 %forward: 22.347609553821513 %backward: 59.8944821931769 [2025-04-12 14:41:48,824] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 41904.70 | forward: 145607.03 | backward_microstep: 390257.56 | backward: 390245.65 | backward_inner_microstep: 390226.43 | backward_inner: 390219.25 | backward_allreduce_microstep: 9.29 | backward_allreduce: 3.19 | reduce_tied_grads: 0.36 | comms: 18.19 | reduce_grads: 0.24 | step: 366.13 | _step_clipping: 0.13 | _step_step: 364.19 | _step_zero_grad: 0.61 | _step_check_overflow: 0.53 samples/sec: 15.716 | iteration 25520/ 143000 | elapsed time per iteration (ms): 65156.1 | learning rate: 5.591E-04 | approx flops per GPU: 67.8TFLOPS | lm_loss: 2.322265E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 14:52:28,333] [INFO] [logging.py:60:log_dist] [Rank 0] step=25530, skipped=29, lr=[0.000559020881965826, 0.000559020881965826], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25530 loss: 2.3228 iter time (s): 63.950 samples/sec: 16.012 %comms: 0.0028277124806525833 %optimizer_step 0.055669030617697665 %forward: 22.783776680351952 %backward: 61.014242691208786 [2025-04-12 14:52:28,334] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29825.05 | forward: 145703.16 | backward_microstep: 390199.03 | backward: 390188.51 | backward_inner_microstep: 390170.30 | backward_inner: 390163.51 | backward_allreduce_microstep: 8.85 | backward_allreduce: 3.08 | reduce_tied_grads: 0.36 | comms: 18.08 | reduce_grads: 0.27 | step: 356.01 | _step_clipping: 0.14 | _step_step: 354.18 | _step_zero_grad: 0.54 | _step_check_overflow: 0.53 samples/sec: 16.012 | iteration 25530/ 143000 | elapsed time per iteration (ms): 63951.0 | learning rate: 5.590E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.321885E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 15:03:09,289] [INFO] [logging.py:60:log_dist] [Rank 0] step=25540, skipped=29, lr=[0.0005589876243751362, 0.0005589876243751362], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25540 loss: 2.3260 iter time (s): 64.095 samples/sec: 15.976 %comms: 0.0028470361005777053 %optimizer_step 0.057677608070145193 %forward: 22.69655241381372 %backward: 60.898891637738814 [2025-04-12 15:03:09,290] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31335.50 | forward: 145473.58 | backward_microstep: 390343.43 | backward: 390331.52 | backward_inner_microstep: 390309.62 | backward_inner: 390302.40 | backward_allreduce_microstep: 11.64 | backward_allreduce: 3.44 | reduce_tied_grads: 0.35 | comms: 18.25 | reduce_grads: 0.25 | step: 369.68 | _step_clipping: 0.13 | _step_step: 367.38 | _step_zero_grad: 0.58 | _step_check_overflow: 0.92 samples/sec: 15.976 | iteration 25540/ 143000 | elapsed time per iteration (ms): 64095.6 | learning rate: 5.590E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.316445E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 15:13:51,054] [INFO] [logging.py:60:log_dist] [Rank 0] step=25550, skipped=29, lr=[0.0005589543542845423, 0.0005589543542845423], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25550 loss: 2.3246 iter time (s): 64.176 samples/sec: 15.956 %comms: 0.0028250970890287865 %optimizer_step 0.05561771507214411 %forward: 22.71766430554619 %backward: 60.8053306599602 [2025-04-12 15:13:51,055] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31969.52 | forward: 145792.56 | backward_microstep: 390233.06 | backward: 390223.42 | backward_inner_microstep: 390206.17 | backward_inner: 390199.68 | backward_allreduce_microstep: 8.29 | backward_allreduce: 2.86 | reduce_tied_grads: 0.33 | comms: 18.13 | reduce_grads: 0.21 | step: 356.93 | _step_clipping: 0.16 | _step_step: 355.11 | _step_zero_grad: 0.52 | _step_check_overflow: 0.54 samples/sec: 15.956 | iteration 25550/ 143000 | elapsed time per iteration (ms): 64176.5 | learning rate: 5.590E-04 | approx flops per GPU: 68.8TFLOPS | lm_loss: 2.321143E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 15:24:23,444] [INFO] [logging.py:60:log_dist] [Rank 0] step=25560, skipped=29, lr=[0.0005589210716956499, 0.0005589210716956499], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25560 loss: 2.3271 iter time (s): 63.238 samples/sec: 16.193 %comms: 0.0028428072143013112 %optimizer_step 0.0564584687227668 %forward: 22.993837757760883 %backward: 61.6978346925253 [2025-04-12 15:24:23,445] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23034.96 | forward: 145409.50 | backward_microstep: 390176.39 | backward: 390167.63 | backward_inner_microstep: 390150.67 | backward_inner: 390144.29 | backward_allreduce_microstep: 8.22 | backward_allreduce: 2.81 | reduce_tied_grads: 0.30 | comms: 17.98 | reduce_grads: 0.21 | step: 357.03 | _step_clipping: 0.12 | _step_step: 355.14 | _step_zero_grad: 0.52 | _step_check_overflow: 0.49 samples/sec: 16.193 | iteration 25560/ 143000 | elapsed time per iteration (ms): 63239.0 | learning rate: 5.589E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.329022E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 15:35:05,359] [INFO] [logging.py:60:log_dist] [Rank 0] step=25570, skipped=29, lr=[0.0005588877766100657, 0.0005588877766100657], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25570 loss: 2.3264 iter time (s): 64.191 samples/sec: 15.952 %comms: 0.002833535920250416 %optimizer_step 0.05760554866332581 %forward: 22.66961021608892 %backward: 60.79730857462155 [2025-04-12 15:35:05,360] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32322.44 | forward: 145518.21 | backward_microstep: 390273.36 | backward: 390263.24 | backward_inner_microstep: 390245.19 | backward_inner: 390238.55 | backward_allreduce_microstep: 8.75 | backward_allreduce: 3.01 | reduce_tied_grads: 0.35 | comms: 18.19 | reduce_grads: 0.23 | step: 369.78 | _step_clipping: 0.13 | _step_step: 367.89 | _step_zero_grad: 0.55 | _step_check_overflow: 0.57 samples/sec: 15.952 | iteration 25570/ 143000 | elapsed time per iteration (ms): 64191.5 | learning rate: 5.589E-04 | approx flops per GPU: 68.8TFLOPS | lm_loss: 2.321830E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 15:45:45,275] [INFO] [logging.py:60:log_dist] [Rank 0] step=25580, skipped=29, lr=[0.0005588544690293966, 0.0005588544690293966], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25580 loss: 2.3203 iter time (s): 63.991 samples/sec: 16.002 %comms: 0.002830243604841715 %optimizer_step 0.05600374851929634 %forward: 22.720363758885895 %backward: 61.00985258011703 [2025-04-12 15:45:45,275] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30296.81 | forward: 145389.71 | backward_microstep: 390420.98 | backward: 390407.69 | backward_inner_microstep: 390388.89 | backward_inner: 390381.95 | backward_allreduce_microstep: 9.02 | backward_allreduce: 3.11 | reduce_tied_grads: 0.34 | comms: 18.11 | reduce_grads: 0.20 | step: 358.37 | _step_clipping: 0.12 | _step_step: 356.53 | _step_zero_grad: 0.50 | _step_check_overflow: 0.58 samples/sec: 16.002 | iteration 25580/ 143000 | elapsed time per iteration (ms): 63991.6 | learning rate: 5.589E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.312819E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 15:56:14,028] [INFO] [logging.py:60:log_dist] [Rank 0] step=25590, skipped=29, lr=[0.00055882114895525, 0.00055882114895525], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25590 loss: 2.3273 iter time (s): 62.875 samples/sec: 16.286 %comms: 0.0029066487344012945 %optimizer_step 0.05676836315286224 %forward: 23.118378374630492 %backward: 62.06882438583528 [2025-04-12 15:56:14,029] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19329.80 | forward: 145356.37 | backward_microstep: 390266.17 | backward: 390256.57 | backward_inner_microstep: 390239.65 | backward_inner: 390233.07 | backward_allreduce_microstep: 8.15 | backward_allreduce: 2.78 | reduce_tied_grads: 0.30 | comms: 18.28 | reduce_grads: 0.20 | step: 356.93 | _step_clipping: 0.11 | _step_step: 355.05 | _step_zero_grad: 0.50 | _step_check_overflow: 0.68 samples/sec: 16.286 | iteration 25590/ 143000 | elapsed time per iteration (ms): 62875.4 | learning rate: 5.588E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.326614E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 16:06:43,651] [INFO] [logging.py:60:log_dist] [Rank 0] step=25600, skipped=29, lr=[0.000558787816389234, 0.000558787816389234], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25600 loss: 2.3143 iter time (s): 62.962 samples/sec: 16.264 %comms: 0.002867498713725657 %optimizer_step 0.061470160012983055 %forward: 23.089032845061862 %backward: 61.996616683235054 [2025-04-12 16:06:43,652] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20062.17 | forward: 145372.38 | backward_microstep: 390354.70 | backward: 390340.97 | backward_inner_microstep: 390323.57 | backward_inner: 390315.10 | backward_allreduce_microstep: 8.29 | backward_allreduce: 2.84 | reduce_tied_grads: 0.32 | comms: 18.05 | reduce_grads: 0.20 | step: 387.03 | _step_clipping: 0.11 | _step_step: 385.28 | _step_zero_grad: 0.52 | _step_check_overflow: 0.46 samples/sec: 16.264 | iteration 25600/ 143000 | elapsed time per iteration (ms): 62962.2 | learning rate: 5.588E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.325241E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 16:17:25,538] [INFO] [logging.py:60:log_dist] [Rank 0] step=25610, skipped=29, lr=[0.0005587544713329578, 0.0005587544713329578], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25610 loss: 2.3257 iter time (s): 64.188 samples/sec: 15.953 %comms: 0.0028400867817038847 %optimizer_step 0.057364760868011774 %forward: 22.682263677028725 %backward: 60.80828470883225 [2025-04-12 16:17:25,538] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32126.53 | forward: 145593.01 | backward_microstep: 390328.34 | backward: 390316.48 | backward_inner_microstep: 390293.27 | backward_inner: 390286.20 | backward_allreduce_microstep: 9.55 | backward_allreduce: 3.27 | reduce_tied_grads: 0.36 | comms: 18.23 | reduce_grads: 0.27 | step: 368.21 | _step_clipping: 0.12 | _step_step: 366.20 | _step_zero_grad: 0.59 | _step_check_overflow: 0.64 samples/sec: 15.953 | iteration 25610/ 143000 | elapsed time per iteration (ms): 64188.7 | learning rate: 5.588E-04 | approx flops per GPU: 68.8TFLOPS | lm_loss: 2.321579E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 16:28:03,872] [INFO] [logging.py:60:log_dist] [Rank 0] step=25620, skipped=29, lr=[0.0005587211137880304, 0.0005587211137880304], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25620 loss: 2.3044 iter time (s): 63.833 samples/sec: 16.042 %comms: 0.0028590637558712634 %optimizer_step 0.05650060078809822 %forward: 22.806940237632777 %backward: 61.13904498691905 [2025-04-12 16:28:03,873] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28669.93 | forward: 145583.26 | backward_microstep: 390279.11 | backward: 390268.12 | backward_inner_microstep: 390249.77 | backward_inner: 390242.97 | backward_allreduce_microstep: 8.81 | backward_allreduce: 3.03 | reduce_tied_grads: 0.36 | comms: 18.25 | reduce_grads: 0.23 | step: 360.66 | _step_clipping: 0.13 | _step_step: 358.67 | _step_zero_grad: 0.55 | _step_check_overflow: 0.66 samples/sec: 16.042 | iteration 25620/ 143000 | elapsed time per iteration (ms): 63833.5 | learning rate: 5.587E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.313595E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 16:38:46,502] [INFO] [logging.py:60:log_dist] [Rank 0] step=25630, skipped=29, lr=[0.0005586877437560619, 0.0005586877437560619], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25630 loss: 2.3557 iter time (s): 64.262 samples/sec: 15.935 %comms: 0.002834356669431459 %optimizer_step 0.05676422895854225 %forward: 22.647778452382564 %backward: 60.73384379133812 [2025-04-12 16:38:46,502] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32996.41 | forward: 145539.82 | backward_microstep: 390300.77 | backward: 390289.60 | backward_inner_microstep: 390270.65 | backward_inner: 390263.75 | backward_allreduce_microstep: 9.20 | backward_allreduce: 3.18 | reduce_tied_grads: 0.39 | comms: 18.21 | reduce_grads: 0.25 | step: 364.78 | _step_clipping: 0.13 | _step_step: 362.84 | _step_zero_grad: 0.61 | _step_check_overflow: 0.55 samples/sec: 15.935 | iteration 25630/ 143000 | elapsed time per iteration (ms): 64262.9 | learning rate: 5.587E-04 | approx flops per GPU: 68.7TFLOPS | lm_loss: 2.332444E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 16:49:20,581] [INFO] [logging.py:60:log_dist] [Rank 0] step=25640, skipped=29, lr=[0.0005586543612386629, 0.0005586543612386629], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25640 loss: 2.3499 iter time (s): 63.407 samples/sec: 16.150 %comms: 0.002829102821294212 %optimizer_step 0.056426463290922574 %forward: 22.945449411612604 %backward: 61.53173563950285 [2025-04-12 16:49:20,582] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24621.49 | forward: 145491.20 | backward_microstep: 390165.78 | backward: 390156.92 | backward_inner_microstep: 390140.26 | backward_inner: 390133.76 | backward_allreduce_microstep: 8.00 | backward_allreduce: 2.75 | reduce_tied_grads: 0.30 | comms: 17.94 | reduce_grads: 0.20 | step: 357.79 | _step_clipping: 0.13 | _step_step: 355.98 | _step_zero_grad: 0.50 | _step_check_overflow: 0.58 samples/sec: 16.149 | iteration 25640/ 143000 | elapsed time per iteration (ms): 63408.0 | learning rate: 5.587E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.354124E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 16:59:54,210] [INFO] [logging.py:60:log_dist] [Rank 0] step=25650, skipped=29, lr=[0.0005586209662374446, 0.0005586209662374446], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25650 loss: 2.3311 iter time (s): 63.362 samples/sec: 16.161 %comms: 0.002857122018826833 %optimizer_step 0.0559365046772652 %forward: 22.95056024521916 %backward: 61.577311674685035 [2025-04-12 16:59:54,210] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24207.13 | forward: 145419.86 | backward_microstep: 390176.82 | backward: 390167.55 | backward_inner_microstep: 390150.81 | backward_inner: 390144.54 | backward_allreduce_microstep: 8.08 | backward_allreduce: 2.79 | reduce_tied_grads: 0.32 | comms: 18.10 | reduce_grads: 0.21 | step: 354.43 | _step_clipping: 0.12 | _step_step: 352.57 | _step_zero_grad: 0.51 | _step_check_overflow: 0.63 samples/sec: 16.161 | iteration 25650/ 143000 | elapsed time per iteration (ms): 63362.8 | learning rate: 5.586E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.341875E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 17:10:30,076] [INFO] [logging.py:60:log_dist] [Rank 0] step=25660, skipped=29, lr=[0.0005585875587540188, 0.0005585875587540188], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25660 loss: 2.3439 iter time (s): 63.586 samples/sec: 16.104 %comms: 0.002846838460523709 %optimizer_step 0.05913103767999421 %forward: 22.862654013941967 %backward: 61.367232609078734 [2025-04-12 17:10:30,077] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26452.00 | forward: 145374.66 | backward_microstep: 390218.83 | backward: 390210.19 | backward_inner_microstep: 390191.85 | backward_inner: 390185.15 | backward_allreduce_microstep: 8.98 | backward_allreduce: 3.07 | reduce_tied_grads: 0.33 | comms: 18.10 | reduce_grads: 0.23 | step: 375.99 | _step_clipping: 0.12 | _step_step: 374.02 | _step_zero_grad: 0.54 | _step_check_overflow: 0.69 samples/sec: 16.104 | iteration 25660/ 143000 | elapsed time per iteration (ms): 63586.7 | learning rate: 5.586E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.339869E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 17:21:06,816] [INFO] [logging.py:60:log_dist] [Rank 0] step=25670, skipped=29, lr=[0.0005585541387899979, 0.0005585541387899979], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25670 loss: 2.3127 iter time (s): 63.673 samples/sec: 16.082 %comms: 0.0028376585723050005 %optimizer_step 0.0604219377676777 %forward: 22.84862425198309 %backward: 61.284166787712614 [2025-04-12 17:21:06,817] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27178.82 | forward: 145484.77 | backward_microstep: 390225.68 | backward: 390216.62 | backward_inner_microstep: 390198.33 | backward_inner: 390191.68 | backward_allreduce_microstep: 8.92 | backward_allreduce: 3.01 | reduce_tied_grads: 0.34 | comms: 18.07 | reduce_grads: 0.24 | step: 384.73 | _step_clipping: 0.13 | _step_step: 382.84 | _step_zero_grad: 0.56 | _step_check_overflow: 0.53 samples/sec: 16.082 | iteration 25670/ 143000 | elapsed time per iteration (ms): 63674.0 | learning rate: 5.586E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.314636E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 17:31:40,280] [INFO] [logging.py:60:log_dist] [Rank 0] step=25680, skipped=29, lr=[0.0005585207063469947, 0.0005585207063469947], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25680 loss: 2.2982 iter time (s): 63.346 samples/sec: 16.165 %comms: 0.0028519138095574256 %optimizer_step 0.05686758967845477 %forward: 22.971011164989825 %backward: 61.613701485048914 [2025-04-12 17:31:40,281] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23794.57 | forward: 145511.83 | backward_microstep: 390311.04 | backward: 390297.24 | backward_inner_microstep: 390279.58 | backward_inner: 390273.09 | backward_allreduce_microstep: 8.49 | backward_allreduce: 2.91 | reduce_tied_grads: 0.31 | comms: 18.07 | reduce_grads: 0.22 | step: 360.23 | _step_clipping: 0.13 | _step_step: 358.35 | _step_zero_grad: 0.51 | _step_check_overflow: 0.64 samples/sec: 16.165 | iteration 25680/ 143000 | elapsed time per iteration (ms): 63346.4 | learning rate: 5.585E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.324146E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 17:42:25,022] [INFO] [logging.py:60:log_dist] [Rank 0] step=25690, skipped=29, lr=[0.0005584872614266231, 0.0005584872614266231], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25690 loss: 2.3168 iter time (s): 64.474 samples/sec: 15.882 %comms: 0.003121787911082437 %optimizer_step 0.05614414635948746 %forward: 22.602296391476983 %backward: 60.5383967669235 [2025-04-12 17:42:25,023] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 34859.91 | forward: 145725.18 | backward_microstep: 390325.25 | backward: 390312.95 | backward_inner_microstep: 390293.59 | backward_inner: 390284.82 | backward_allreduce_microstep: 9.43 | backward_allreduce: 3.26 | reduce_tied_grads: 0.34 | comms: 20.13 | reduce_grads: 0.23 | step: 361.98 | _step_clipping: 0.12 | _step_step: 357.68 | _step_zero_grad: 0.54 | _step_check_overflow: 0.70 samples/sec: 15.882 | iteration 25690/ 143000 | elapsed time per iteration (ms): 64474.2 | learning rate: 5.585E-04 | approx flops per GPU: 68.5TFLOPS | lm_loss: 2.320275E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 17:53:13,948] [INFO] [logging.py:60:log_dist] [Rank 0] step=25700, skipped=29, lr=[0.0005584538040304971, 0.0005584538040304971], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25700 loss: 2.3289 iter time (s): 64.892 samples/sec: 15.780 %comms: 0.0027965682109175535 %optimizer_step 0.058331498420705376 %forward: 22.474106863130512 %backward: 60.149179579709056 [2025-04-12 17:53:13,949] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 38895.93 | forward: 145838.80 | backward_microstep: 390333.55 | backward: 390319.59 | backward_inner_microstep: 390300.20 | backward_inner: 390291.12 | backward_allreduce_microstep: 9.28 | backward_allreduce: 3.19 | reduce_tied_grads: 0.36 | comms: 18.15 | reduce_grads: 0.24 | step: 378.52 | _step_clipping: 0.15 | _step_step: 376.37 | _step_zero_grad: 0.64 | _step_check_overflow: 0.66 samples/sec: 15.780 | iteration 25700/ 143000 | elapsed time per iteration (ms): 64892.6 | learning rate: 5.585E-04 | approx flops per GPU: 68.1TFLOPS | lm_loss: 2.327403E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 18:04:03,212] [INFO] [logging.py:60:log_dist] [Rank 0] step=25710, skipped=29, lr=[0.0005584203341602316, 0.0005584203341602316], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25710 loss: 2.2938 iter time (s): 64.926 samples/sec: 15.772 %comms: 0.002785928614391336 %optimizer_step 0.05516378081989347 %forward: 22.42504988149257 %backward: 60.11597394923416 [2025-04-12 18:04:03,213] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 39515.70 | forward: 145596.43 | backward_microstep: 390319.63 | backward: 390307.76 | backward_inner_microstep: 390289.80 | backward_inner: 390283.14 | backward_allreduce_microstep: 8.52 | backward_allreduce: 2.96 | reduce_tied_grads: 0.33 | comms: 18.09 | reduce_grads: 0.21 | step: 358.16 | _step_clipping: 0.13 | _step_step: 356.33 | _step_zero_grad: 0.53 | _step_check_overflow: 0.58 samples/sec: 15.772 | iteration 25710/ 143000 | elapsed time per iteration (ms): 64926.4 | learning rate: 5.584E-04 | approx flops per GPU: 68.0TFLOPS | lm_loss: 2.315989E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 18:14:52,663] [INFO] [logging.py:60:log_dist] [Rank 0] step=25720, skipped=29, lr=[0.0005583868518174421, 0.0005583868518174421], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25720 loss: 2.3156 iter time (s): 64.945 samples/sec: 15.767 %comms: 0.002821175635804296 %optimizer_step 0.055963081932926206 %forward: 22.41877204629237 %backward: 60.091907938861254 [2025-04-12 18:14:52,664] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 39744.82 | forward: 145597.65 | backward_microstep: 390275.53 | backward: 390264.04 | backward_inner_microstep: 390245.51 | backward_inner: 390238.29 | backward_allreduce_microstep: 9.03 | backward_allreduce: 3.23 | reduce_tied_grads: 0.33 | comms: 18.32 | reduce_grads: 0.23 | step: 363.45 | _step_clipping: 0.13 | _step_step: 361.27 | _step_zero_grad: 0.59 | _step_check_overflow: 0.78 samples/sec: 15.767 | iteration 25720/ 143000 | elapsed time per iteration (ms): 64945.2 | learning rate: 5.584E-04 | approx flops per GPU: 68.0TFLOPS | lm_loss: 2.320303E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 18:25:29,200] [INFO] [logging.py:60:log_dist] [Rank 0] step=25730, skipped=29, lr=[0.0005583533570037444, 0.0005583533570037444], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25730 loss: 2.3387 iter time (s): 63.653 samples/sec: 16.087 %comms: 0.0028518606450864134 %optimizer_step 0.0564377299872612 %forward: 22.834379835668486 %backward: 61.30952409735516 [2025-04-12 18:25:29,200] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27114.02 | forward: 145347.71 | backward_microstep: 390263.52 | backward: 390253.59 | backward_inner_microstep: 390235.18 | backward_inner: 390228.46 | backward_allreduce_microstep: 9.00 | backward_allreduce: 3.11 | reduce_tied_grads: 0.31 | comms: 18.15 | reduce_grads: 0.20 | step: 359.24 | _step_clipping: 0.11 | _step_step: 357.54 | _step_zero_grad: 0.48 | _step_check_overflow: 0.53 samples/sec: 16.087 | iteration 25730/ 143000 | elapsed time per iteration (ms): 63653.6 | learning rate: 5.584E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.322200E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 18:36:05,146] [INFO] [logging.py:60:log_dist] [Rank 0] step=25740, skipped=29, lr=[0.0005583198497207552, 0.0005583198497207552], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25740 loss: 2.3199 iter time (s): 63.594 samples/sec: 16.102 %comms: 0.0028634672911955136 %optimizer_step 0.05819919101174225 %forward: 22.889549774883587 %backward: 61.34792676105308 [2025-04-12 18:36:05,146] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26404.61 | forward: 145563.81 | backward_microstep: 390145.11 | backward: 390136.01 | backward_inner_microstep: 390119.39 | backward_inner: 390113.10 | backward_allreduce_microstep: 7.97 | backward_allreduce: 2.75 | reduce_tied_grads: 0.32 | comms: 18.21 | reduce_grads: 0.20 | step: 370.11 | _step_clipping: 0.12 | _step_step: 368.30 | _step_zero_grad: 0.55 | _step_check_overflow: 0.51 samples/sec: 16.102 | iteration 25740/ 143000 | elapsed time per iteration (ms): 63594.6 | learning rate: 5.583E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.318470E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 18:46:49,234] [INFO] [logging.py:60:log_dist] [Rank 0] step=25750, skipped=29, lr=[0.0005582863299700917, 0.0005582863299700917], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25750 loss: 2.3094 iter time (s): 64.408 samples/sec: 15.899 %comms: 0.0028153154122343943 %optimizer_step 0.059272134109438816 %forward: 22.62151531068115 %backward: 60.59029240000613 [2025-04-12 18:46:49,234] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 34275.77 | forward: 145700.99 | backward_microstep: 390261.92 | backward: 390250.85 | backward_inner_microstep: 390232.13 | backward_inner: 390225.22 | backward_allreduce_microstep: 9.14 | backward_allreduce: 3.31 | reduce_tied_grads: 0.34 | comms: 18.13 | reduce_grads: 0.23 | step: 381.76 | _step_clipping: 0.13 | _step_step: 379.76 | _step_zero_grad: 0.58 | _step_check_overflow: 0.59 samples/sec: 15.898 | iteration 25750/ 143000 | elapsed time per iteration (ms): 64408.8 | learning rate: 5.583E-04 | approx flops per GPU: 68.6TFLOPS | lm_loss: 2.318755E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 18:57:33,942] [INFO] [logging.py:60:log_dist] [Rank 0] step=25760, skipped=29, lr=[0.0005582527977533717, 0.0005582527977533717], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25760 loss: 2.3316 iter time (s): 64.470 samples/sec: 15.883 %comms: 0.002837899298221589 %optimizer_step 0.05761820535191314 %forward: 22.57854737244938 %backward: 60.529504355090765 [2025-04-12 18:57:33,943] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 35053.03 | forward: 145564.42 | backward_microstep: 390245.62 | backward: 390235.13 | backward_inner_microstep: 390217.01 | backward_inner: 390210.31 | backward_allreduce_microstep: 8.74 | backward_allreduce: 3.00 | reduce_tied_grads: 0.55 | comms: 18.30 | reduce_grads: 0.23 | step: 371.47 | _step_clipping: 0.14 | _step_step: 369.45 | _step_zero_grad: 0.55 | _step_check_overflow: 0.68 samples/sec: 15.883 | iteration 25760/ 143000 | elapsed time per iteration (ms): 64470.8 | learning rate: 5.583E-04 | approx flops per GPU: 68.5TFLOPS | lm_loss: 2.320293E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 19:08:24,211] [INFO] [logging.py:60:log_dist] [Rank 0] step=25770, skipped=29, lr=[0.0005582192530722137, 0.0005582192530722137], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25770 loss: 2.3234 iter time (s): 65.026 samples/sec: 15.747 %comms: 0.0027986030701040495 %optimizer_step 0.05661861207794751 %forward: 22.447456044950403 %backward: 60.034857045068016 [2025-04-12 19:08:24,212] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 40047.88 | forward: 145967.27 | backward_microstep: 390396.48 | backward: 390383.85 | backward_inner_microstep: 390363.90 | backward_inner: 390356.63 | backward_allreduce_microstep: 9.73 | backward_allreduce: 3.39 | reduce_tied_grads: 0.38 | comms: 18.20 | reduce_grads: 0.25 | step: 368.17 | _step_clipping: 0.14 | _step_step: 366.19 | _step_zero_grad: 0.58 | _step_check_overflow: 0.58 samples/sec: 15.747 | iteration 25770/ 143000 | elapsed time per iteration (ms): 65026.9 | learning rate: 5.582E-04 | approx flops per GPU: 67.9TFLOPS | lm_loss: 2.330776E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 19:19:17,846] [INFO] [logging.py:60:log_dist] [Rank 0] step=25780, skipped=29, lr=[0.0005581856959282366, 0.0005581856959282366], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25780 loss: 2.3178 iter time (s): 65.363 samples/sec: 15.666 %comms: 0.0027768582883722585 %optimizer_step 0.05639650798602128 %forward: 22.31260980008194 %backward: 59.723412410445164 [2025-04-12 19:19:17,846] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 43549.24 | forward: 145841.52 | backward_microstep: 390382.32 | backward: 390369.08 | backward_inner_microstep: 390349.65 | backward_inner: 390342.54 | backward_allreduce_microstep: 9.32 | backward_allreduce: 3.19 | reduce_tied_grads: 0.36 | comms: 18.15 | reduce_grads: 0.24 | step: 368.62 | _step_clipping: 0.12 | _step_step: 366.63 | _step_zero_grad: 0.63 | _step_check_overflow: 0.58 samples/sec: 15.666 | iteration 25780/ 143000 | elapsed time per iteration (ms): 65363.5 | learning rate: 5.582E-04 | approx flops per GPU: 67.6TFLOPS | lm_loss: 2.325823E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 19:30:10,036] [INFO] [logging.py:60:log_dist] [Rank 0] step=25790, skipped=29, lr=[0.0005581521263230601, 0.0005581521263230601], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25790 loss: 2.3191 iter time (s): 65.218 samples/sec: 15.701 %comms: 0.002792401661622767 %optimizer_step 0.05702308287059818 %forward: 22.36370750117662 %backward: 59.86136198979597 [2025-04-12 19:30:10,037] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 42014.50 | forward: 145852.57 | backward_microstep: 390419.90 | backward: 390406.36 | backward_inner_microstep: 390386.83 | backward_inner: 390379.61 | backward_allreduce_microstep: 9.20 | backward_allreduce: 3.16 | reduce_tied_grads: 0.34 | comms: 18.21 | reduce_grads: 0.23 | step: 371.90 | _step_clipping: 0.14 | _step_step: 369.79 | _step_zero_grad: 0.59 | _step_check_overflow: 0.73 samples/sec: 15.701 | iteration 25790/ 143000 | elapsed time per iteration (ms): 65219.1 | learning rate: 5.582E-04 | approx flops per GPU: 67.7TFLOPS | lm_loss: 2.319883E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 19:40:56,912] [INFO] [logging.py:60:log_dist] [Rank 0] step=25800, skipped=29, lr=[0.0005581185442583044, 0.0005581185442583044], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25800 loss: 2.3141 iter time (s): 64.687 samples/sec: 15.830 %comms: 0.0028081569312439236 %optimizer_step 0.058662726323567276 %forward: 22.506450105928717 %backward: 60.3356453276227 [2025-04-12 19:40:56,913] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 37126.85 | forward: 145587.37 | backward_microstep: 390303.97 | backward: 390292.91 | backward_inner_microstep: 390273.54 | backward_inner: 390266.52 | backward_allreduce_microstep: 9.34 | backward_allreduce: 3.21 | reduce_tied_grads: 0.34 | comms: 18.17 | reduce_grads: 0.23 | step: 379.47 | _step_clipping: 0.13 | _step_step: 377.64 | _step_zero_grad: 0.55 | _step_check_overflow: 0.51 samples/sec: 15.830 | iteration 25800/ 143000 | elapsed time per iteration (ms): 64687.6 | learning rate: 5.581E-04 | approx flops per GPU: 68.3TFLOPS | lm_loss: 2.317064E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 19:51:52,225] [INFO] [logging.py:60:log_dist] [Rank 0] step=25810, skipped=29, lr=[0.0005580849497355902, 0.0005580849497355902], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25810 loss: 2.3280 iter time (s): 65.531 samples/sec: 15.626 %comms: 0.0027678904361265167 %optimizer_step 0.055740664395852874 %forward: 22.260161409608823 %backward: 59.56293131572205 [2025-04-12 19:51:52,226] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 45265.57 | forward: 145872.32 | backward_microstep: 390333.70 | backward: 390319.85 | backward_inner_microstep: 390300.81 | backward_inner: 390293.77 | backward_allreduce_microstep: 9.13 | backward_allreduce: 3.14 | reduce_tied_grads: 0.34 | comms: 18.14 | reduce_grads: 0.22 | step: 365.27 | _step_clipping: 0.12 | _step_step: 363.19 | _step_zero_grad: 0.65 | _step_check_overflow: 0.64 samples/sec: 15.626 | iteration 25810/ 143000 | elapsed time per iteration (ms): 65531.3 | learning rate: 5.581E-04 | approx flops per GPU: 67.4TFLOPS | lm_loss: 2.320495E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 20:02:25,136] [INFO] [logging.py:60:log_dist] [Rank 0] step=25820, skipped=29, lr=[0.0005580513427565392, 0.0005580513427565392], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25820 loss: 2.3421 iter time (s): 63.291 samples/sec: 16.179 %comms: 0.0029022870423084358 %optimizer_step 0.06117713995160109 %forward: 22.98440155160822 %backward: 61.67781750343963 [2025-04-12 20:02:25,137] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23179.97 | forward: 145469.44 | backward_microstep: 390381.15 | backward: 390362.02 | backward_inner_microstep: 390342.92 | backward_inner: 390336.46 | backward_allreduce_microstep: 9.91 | backward_allreduce: 4.51 | reduce_tied_grads: 0.33 | comms: 18.37 | reduce_grads: 0.23 | step: 387.19 | _step_clipping: 1.75 | _step_step: 383.64 | _step_zero_grad: 0.52 | _step_check_overflow: 0.62 samples/sec: 16.179 | iteration 25820/ 143000 | elapsed time per iteration (ms): 63291.1 | learning rate: 5.581E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.325615E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 20:12:56,706] [INFO] [logging.py:60:log_dist] [Rank 0] step=25830, skipped=29, lr=[0.0005580177233227733, 0.0005580177233227733], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25830 loss: 2.3157 iter time (s): 63.156 samples/sec: 16.214 %comms: 0.00288492908707383 %optimizer_step 0.056316612334905126 %forward: 23.025260198011882 %backward: 61.79472720890515 [2025-04-12 20:12:56,707] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22035.48 | forward: 145419.36 | backward_microstep: 390284.49 | backward: 390273.52 | backward_inner_microstep: 390256.62 | backward_inner: 390250.21 | backward_allreduce_microstep: 8.11 | backward_allreduce: 2.80 | reduce_tied_grads: 0.31 | comms: 18.22 | reduce_grads: 0.21 | step: 355.68 | _step_clipping: 0.11 | _step_step: 353.74 | _step_zero_grad: 0.57 | _step_check_overflow: 0.64 samples/sec: 16.214 | iteration 25830/ 143000 | elapsed time per iteration (ms): 63157.0 | learning rate: 5.580E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.324102E+00 | loss scale: 524288.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 20:13:59,723] [INFO] [stage1.py:697:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 524288.0, reducing to 524288.0 [2025-04-12 20:16:05,533] [INFO] [stage1.py:697:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 524288.0, reducing to 262144.0 [2025-04-12 20:23:26,034] [INFO] [logging.py:60:log_dist] [Rank 0] step=25840, skipped=31, lr=[0.0005579908188094561, 0.0005579908188094561], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25840 loss: 2.3180 iter time (s): 62.932 samples/sec: 16.271 %comms: 0.002317993551994697 %optimizer_step 0.04635922699494201 %forward: 23.093719192251793 %backward: 62.001487063424406 [2025-04-12 20:23:26,035] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20019.01 | forward: 145333.83 | backward_microstep: 390198.86 | backward: 390188.93 | backward_inner_microstep: 390172.29 | backward_inner: 390165.97 | backward_allreduce_microstep: 7.84 | backward_allreduce: 2.72 | reduce_tied_grads: 0.27 | comms: 14.59 | reduce_grads: 0.19 | step: 291.75 | _step_clipping: 0.11 | _step_step: 289.94 | _step_zero_grad: 0.52 | _step_check_overflow: 0.60 samples/sec: 16.271 | iteration 25840/ 143000 | elapsed time per iteration (ms): 62932.8 | learning rate: 5.580E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.317406E+00 | loss scale: 262144.0 | number of skipped iterations: 2 | number of nan iterations: 0 | time (ms) [2025-04-12 20:34:14,933] [INFO] [logging.py:60:log_dist] [Rank 0] step=25850, skipped=31, lr=[0.0005579571769612926, 0.0005579571769612926], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25850 loss: 2.3160 iter time (s): 64.889 samples/sec: 15.781 %comms: 0.0027909520941211486 %optimizer_step 0.05559983677192367 %forward: 22.441071490820256 %backward: 60.14212809637897 [2025-04-12 20:34:14,934] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 39152.97 | forward: 145618.40 | backward_microstep: 390268.85 | backward: 390257.68 | backward_inner_microstep: 390239.34 | backward_inner: 390232.55 | backward_allreduce_microstep: 8.85 | backward_allreduce: 3.05 | reduce_tied_grads: 0.35 | comms: 18.11 | reduce_grads: 0.23 | step: 360.78 | _step_clipping: 0.14 | _step_step: 358.77 | _step_zero_grad: 0.70 | _step_check_overflow: 0.52 samples/sec: 15.781 | iteration 25850/ 143000 | elapsed time per iteration (ms): 64889.9 | learning rate: 5.580E-04 | approx flops per GPU: 68.1TFLOPS | lm_loss: 2.314760E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 20:44:59,239] [INFO] [logging.py:60:log_dist] [Rank 0] step=25860, skipped=31, lr=[0.000557923522662959, 0.000557923522662959], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25860 loss: 2.3148 iter time (s): 64.430 samples/sec: 15.893 %comms: 0.00281780406548706 %optimizer_step 0.05893135913888323 %forward: 22.617799568575503 %backward: 60.60042506745288 [2025-04-12 20:44:59,240] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 34188.99 | forward: 145726.37 | backward_microstep: 390461.47 | backward: 390448.25 | backward_inner_microstep: 390427.90 | backward_inner: 390420.55 | backward_allreduce_microstep: 9.91 | backward_allreduce: 3.43 | reduce_tied_grads: 0.37 | comms: 18.16 | reduce_grads: 0.24 | step: 379.69 | _step_clipping: 0.12 | _step_step: 377.64 | _step_zero_grad: 0.63 | _step_check_overflow: 0.63 samples/sec: 15.893 | iteration 25860/ 143000 | elapsed time per iteration (ms): 64430.6 | learning rate: 5.579E-04 | approx flops per GPU: 68.6TFLOPS | lm_loss: 2.323572E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 20:55:38,423] [INFO] [logging.py:60:log_dist] [Rank 0] step=25870, skipped=31, lr=[0.0005578898559160798, 0.0005578898559160798], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25870 loss: 2.3135 iter time (s): 63.918 samples/sec: 16.021 %comms: 0.0028158003832859024 %optimizer_step 0.05589607521575801 %forward: 22.76858230278952 %backward: 61.06651409593462 [2025-04-12 20:55:38,424] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29403.32 | forward: 145531.80 | backward_microstep: 390333.89 | backward: 390323.80 | backward_inner_microstep: 390303.51 | backward_inner: 390296.80 | backward_allreduce_microstep: 8.87 | backward_allreduce: 3.04 | reduce_tied_grads: 0.32 | comms: 18.00 | reduce_grads: 0.23 | step: 357.28 | _step_clipping: 0.13 | _step_step: 355.43 | _step_zero_grad: 0.52 | _step_check_overflow: 0.59 samples/sec: 16.020 | iteration 25870/ 143000 | elapsed time per iteration (ms): 63918.4 | learning rate: 5.579E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.323170E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 21:06:18,449] [INFO] [logging.py:60:log_dist] [Rank 0] step=25880, skipped=31, lr=[0.0005578561767222796, 0.0005578561767222796], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25880 loss: 2.3115 iter time (s): 64.002 samples/sec: 16.000 %comms: 0.0028432059232885542 %optimizer_step 0.056903344604825934 %forward: 22.750858726026458 %backward: 60.98485357889021 [2025-04-12 21:06:18,449] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30217.99 | forward: 145609.83 | backward_microstep: 390324.91 | backward: 390314.68 | backward_inner_microstep: 390292.68 | backward_inner: 390284.17 | backward_allreduce_microstep: 8.91 | backward_allreduce: 3.07 | reduce_tied_grads: 0.37 | comms: 18.20 | reduce_grads: 0.24 | step: 364.19 | _step_clipping: 0.15 | _step_step: 362.14 | _step_zero_grad: 0.64 | _step_check_overflow: 0.61 samples/sec: 15.999 | iteration 25880/ 143000 | elapsed time per iteration (ms): 64002.5 | learning rate: 5.579E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.317341E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 21:17:01,690] [INFO] [logging.py:60:log_dist] [Rank 0] step=25890, skipped=31, lr=[0.0005578224850831842, 0.0005578224850831842], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25890 loss: 2.3263 iter time (s): 64.324 samples/sec: 15.920 %comms: 0.002794255647470341 %optimizer_step 0.05600027540840759 %forward: 22.620877079118785 %backward: 60.66980899859165 [2025-04-12 21:17:01,691] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 33615.65 | forward: 145505.65 | backward_microstep: 390260.67 | backward: 390250.12 | backward_inner_microstep: 390231.48 | backward_inner: 390224.64 | backward_allreduce_microstep: 9.04 | backward_allreduce: 3.11 | reduce_tied_grads: 0.32 | comms: 17.97 | reduce_grads: 0.21 | step: 360.21 | _step_clipping: 0.13 | _step_step: 358.30 | _step_zero_grad: 0.53 | _step_check_overflow: 0.66 samples/sec: 15.919 | iteration 25890/ 143000 | elapsed time per iteration (ms): 64324.2 | learning rate: 5.578E-04 | approx flops per GPU: 68.7TFLOPS | lm_loss: 2.325071E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 21:27:38,273] [INFO] [logging.py:60:log_dist] [Rank 0] step=25900, skipped=31, lr=[0.0005577887810004192, 0.0005577887810004192], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25900 loss: 2.3375 iter time (s): 63.658 samples/sec: 16.086 %comms: 0.0028448397650491122 %optimizer_step 0.056303422782350995 %forward: 22.845588186709488 %backward: 61.319959898040246 [2025-04-12 21:27:38,275] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26910.76 | forward: 145429.47 | backward_microstep: 390359.20 | backward: 390347.99 | backward_inner_microstep: 390330.21 | backward_inner: 390323.64 | backward_allreduce_microstep: 8.58 | backward_allreduce: 2.95 | reduce_tied_grads: 0.32 | comms: 18.11 | reduce_grads: 0.21 | step: 358.41 | _step_clipping: 0.12 | _step_step: 356.38 | _step_zero_grad: 0.70 | _step_check_overflow: 0.60 samples/sec: 16.086 | iteration 25900/ 143000 | elapsed time per iteration (ms): 63658.4 | learning rate: 5.578E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.333865E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 21:38:20,049] [INFO] [logging.py:60:log_dist] [Rank 0] step=25910, skipped=31, lr=[0.000557755064475612, 0.000557755064475612], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25910 loss: 2.3117 iter time (s): 64.177 samples/sec: 15.956 %comms: 0.002825017220761544 %optimizer_step 0.05600903222680365 %forward: 22.683147012878994 %backward: 60.81138890199034 [2025-04-12 21:38:20,049] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32047.38 | forward: 145573.24 | backward_microstep: 390279.24 | backward: 390268.20 | backward_inner_microstep: 390249.41 | backward_inner: 390242.59 | backward_allreduce_microstep: 9.19 | backward_allreduce: 3.10 | reduce_tied_grads: 0.34 | comms: 18.13 | reduce_grads: 0.24 | step: 359.45 | _step_clipping: 0.11 | _step_step: 357.68 | _step_zero_grad: 0.55 | _step_check_overflow: 0.48 samples/sec: 15.956 | iteration 25910/ 143000 | elapsed time per iteration (ms): 64177.4 | learning rate: 5.578E-04 | approx flops per GPU: 68.8TFLOPS | lm_loss: 2.331743E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 21:48:55,345] [INFO] [logging.py:60:log_dist] [Rank 0] step=25920, skipped=31, lr=[0.0005577213355103895, 0.0005577213355103895], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25920 loss: 2.3101 iter time (s): 63.529 samples/sec: 16.119 %comms: 0.0028470273154887473 %optimizer_step 0.054953834490585586 %forward: 22.89354200438564 %backward: 61.40176264226538 [2025-04-12 21:48:55,346] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25946.18 | forward: 145440.63 | backward_microstep: 390088.38 | backward: 390079.91 | backward_inner_microstep: 390061.83 | backward_inner: 390055.37 | backward_allreduce_microstep: 8.91 | backward_allreduce: 3.05 | reduce_tied_grads: 0.33 | comms: 18.09 | reduce_grads: 0.22 | step: 349.12 | _step_clipping: 0.13 | _step_step: 347.48 | _step_zero_grad: 0.47 | _step_check_overflow: 0.46 samples/sec: 16.118 | iteration 25920/ 143000 | elapsed time per iteration (ms): 63529.6 | learning rate: 5.577E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.324299E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 21:59:34,668] [INFO] [logging.py:60:log_dist] [Rank 0] step=25930, skipped=31, lr=[0.0005576875941063797, 0.0005576875941063797], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25930 loss: 2.3151 iter time (s): 63.932 samples/sec: 16.017 %comms: 0.0028015017236638204 %optimizer_step 0.05493370386876282 %forward: 22.776520593841333 %backward: 61.04382585581861 [2025-04-12 21:59:34,668] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29569.59 | forward: 145614.20 | backward_microstep: 390273.48 | backward: 390263.64 | backward_inner_microstep: 390245.96 | backward_inner: 390239.37 | backward_allreduce_microstep: 8.57 | backward_allreduce: 2.95 | reduce_tied_grads: 0.28 | comms: 17.91 | reduce_grads: 0.20 | step: 351.20 | _step_clipping: 0.12 | _step_step: 349.55 | _step_zero_grad: 0.49 | _step_check_overflow: 0.47 samples/sec: 16.017 | iteration 25930/ 143000 | elapsed time per iteration (ms): 63932.3 | learning rate: 5.577E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.318280E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 22:10:14,214] [INFO] [logging.py:60:log_dist] [Rank 0] step=25940, skipped=31, lr=[0.000557653840265211, 0.000557653840265211], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25940 loss: 2.3362 iter time (s): 63.954 samples/sec: 16.012 %comms: 0.0028144309541383913 %optimizer_step 0.05869409172752283 %forward: 22.774253152590546 %backward: 61.02394473753896 [2025-04-12 22:10:14,214] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29717.36 | forward: 145650.45 | backward_microstep: 390282.49 | backward: 390272.51 | backward_inner_microstep: 390254.23 | backward_inner: 390247.45 | backward_allreduce_microstep: 8.95 | backward_allreduce: 3.06 | reduce_tied_grads: 0.31 | comms: 18.00 | reduce_grads: 0.21 | step: 375.37 | _step_clipping: 0.13 | _step_step: 373.57 | _step_zero_grad: 0.53 | _step_check_overflow: 0.54 samples/sec: 16.011 | iteration 25940/ 143000 | elapsed time per iteration (ms): 63954.6 | learning rate: 5.577E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.327407E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 22:20:44,394] [INFO] [logging.py:60:log_dist] [Rank 0] step=25950, skipped=31, lr=[0.0005576200739885128, 0.0005576200739885128], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25950 loss: 2.3105 iter time (s): 63.017 samples/sec: 16.249 %comms: 0.0029056300472607576 %optimizer_step 0.05862301572343904 %forward: 23.097524453783425 %backward: 61.931663127821764 [2025-04-12 22:20:44,395] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20469.20 | forward: 145554.77 | backward_microstep: 390287.79 | backward: 390277.70 | backward_inner_microstep: 390260.17 | backward_inner: 390253.23 | backward_allreduce_microstep: 8.47 | backward_allreduce: 2.94 | reduce_tied_grads: 0.35 | comms: 18.31 | reduce_grads: 0.24 | step: 369.43 | _step_clipping: 0.14 | _step_step: 367.40 | _step_zero_grad: 0.58 | _step_check_overflow: 0.65 samples/sec: 16.249 | iteration 25950/ 143000 | elapsed time per iteration (ms): 63018.1 | learning rate: 5.576E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.320103E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 22:31:28,709] [INFO] [logging.py:60:log_dist] [Rank 0] step=25960, skipped=31, lr=[0.0005575862952779144, 0.0005575862952779144], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25960 loss: 2.3177 iter time (s): 64.431 samples/sec: 15.893 %comms: 0.002825386571026955 %optimizer_step 0.056692121253205725 %forward: 22.576444629526822 %backward: 60.558567379912866 [2025-04-12 22:31:28,710] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 34767.40 | forward: 145462.00 | backward_microstep: 390194.36 | backward: 390184.13 | backward_inner_microstep: 390165.87 | backward_inner: 390159.07 | backward_allreduce_microstep: 8.89 | backward_allreduce: 3.06 | reduce_tied_grads: 0.37 | comms: 18.20 | reduce_grads: 0.23 | step: 365.27 | _step_clipping: 0.14 | _step_step: 363.41 | _step_zero_grad: 0.59 | _step_check_overflow: 0.49 samples/sec: 15.893 | iteration 25960/ 143000 | elapsed time per iteration (ms): 64431.5 | learning rate: 5.576E-04 | approx flops per GPU: 68.6TFLOPS | lm_loss: 2.322117E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 22:41:57,613] [INFO] [logging.py:60:log_dist] [Rank 0] step=25970, skipped=31, lr=[0.0005575525041350465, 0.0005575525041350465], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25970 loss: 2.3169 iter time (s): 62.890 samples/sec: 16.282 %comms: 0.0029209682703385375 %optimizer_step 0.05802699144217674 %forward: 23.121291715836083 %backward: 62.04875986599407 [2025-04-12 22:41:57,614] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19373.79 | forward: 145409.36 | backward_microstep: 390234.12 | backward: 390223.45 | backward_inner_microstep: 390203.19 | backward_inner: 390196.36 | backward_allreduce_microstep: 10.77 | backward_allreduce: 3.11 | reduce_tied_grads: 0.37 | comms: 18.37 | reduce_grads: 0.25 | step: 364.93 | _step_clipping: 0.14 | _step_step: 362.98 | _step_zero_grad: 0.57 | _step_check_overflow: 0.59 samples/sec: 16.282 | iteration 25970/ 143000 | elapsed time per iteration (ms): 62890.4 | learning rate: 5.576E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.317783E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 22:52:38,492] [INFO] [logging.py:60:log_dist] [Rank 0] step=25980, skipped=31, lr=[0.0005575187005615398, 0.0005575187005615398], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25980 loss: 2.3118 iter time (s): 64.087 samples/sec: 15.978 %comms: 0.00281430705268584 %optimizer_step 0.05577944748132878 %forward: 22.742759003614427 %backward: 60.9060170580467 [2025-04-12 22:52:38,492] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30891.73 | forward: 145752.15 | backward_microstep: 390342.10 | backward: 390330.07 | backward_inner_microstep: 390310.37 | backward_inner: 390303.07 | backward_allreduce_microstep: 9.61 | backward_allreduce: 3.30 | reduce_tied_grads: 0.35 | comms: 18.04 | reduce_grads: 0.22 | step: 357.48 | _step_clipping: 0.12 | _step_step: 355.74 | _step_zero_grad: 0.50 | _step_check_overflow: 0.51 samples/sec: 15.978 | iteration 25980/ 143000 | elapsed time per iteration (ms): 64087.9 | learning rate: 5.575E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.308376E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 23:03:21,335] [INFO] [logging.py:60:log_dist] [Rank 0] step=25990, skipped=31, lr=[0.0005574848845590259, 0.0005574848845590259], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 25990 loss: 2.3323 iter time (s): 64.284 samples/sec: 15.929 %comms: 0.002803329682878972 %optimizer_step 0.055819695917301265 %forward: 22.63958433472892 %backward: 60.69791152068436 [2025-04-12 23:03:21,336] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 33241.97 | forward: 145535.85 | backward_microstep: 390199.35 | backward: 390189.24 | backward_inner_microstep: 390171.22 | backward_inner: 390164.37 | backward_allreduce_microstep: 8.78 | backward_allreduce: 3.02 | reduce_tied_grads: 0.33 | comms: 18.02 | reduce_grads: 0.22 | step: 358.83 | _step_clipping: 0.13 | _step_step: 356.90 | _step_zero_grad: 0.56 | _step_check_overflow: 0.57 samples/sec: 15.929 | iteration 25990/ 143000 | elapsed time per iteration (ms): 64284.4 | learning rate: 5.575E-04 | approx flops per GPU: 68.7TFLOPS | lm_loss: 2.331567E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 23:13:54,116] [INFO] [logging.py:60:log_dist] [Rank 0] step=26000, skipped=31, lr=[0.0005574510561291368, 0.0005574510561291368], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26000 loss: 2.3400 iter time (s): 63.277 samples/sec: 16.183 %comms: 0.0028397726258047504 %optimizer_step 0.055617969275916355 %forward: 22.96598088676545 %backward: 61.66948834430907 [2025-04-12 23:13:54,117] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23372.65 | forward: 145322.97 | backward_microstep: 390239.99 | backward: 390229.08 | backward_inner_microstep: 390211.03 | backward_inner: 390204.32 | backward_allreduce_microstep: 8.66 | backward_allreduce: 2.96 | reduce_tied_grads: 0.29 | comms: 17.97 | reduce_grads: 0.20 | step: 351.94 | _step_clipping: 0.13 | _step_step: 350.16 | _step_zero_grad: 0.50 | _step_check_overflow: 0.57 samples/sec: 16.183 | iteration 26000/ 143000 | elapsed time per iteration (ms): 63278.1 | learning rate: 5.575E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.325131E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 23:13:56,970] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: ../checkpoints/mydata_left-pythia160m/global_step26000/mp_rank_00_model_states.pt [2025-04-12 23:14:11,186] [INFO] [engine.py:1805:_copy_recovery_script] creating recovery script ../checkpoints/mydata_left-pythia160m/zero_to_fp32.py [2025-04-12 23:14:11,192] [INFO] [engine.py:1818:_save_zero_checkpoint] zero checkpoint saved ../checkpoints/mydata_left-pythia160m/global_step26000/zero_pp_rank_0_mp_rank_00_optim_states.pt [2025-04-12 23:24:54,249] [INFO] [logging.py:60:log_dist] [Rank 0] step=26010, skipped=31, lr=[0.0005574172152735053, 0.0005574172152735053], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26010 loss: 2.3396 iter time (s): 64.304 samples/sec: 15.924 %comms: 0.0028175272348032465 %optimizer_step 0.05840603596317538 %forward: 22.641004096290548 %backward: 60.6813343602838 [2025-04-12 23:24:54,250] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 33333.26 | forward: 145591.34 | backward_microstep: 390221.06 | backward: 390206.92 | backward_inner_microstep: 390187.40 | backward_inner: 390180.05 | backward_allreduce_microstep: 9.48 | backward_allreduce: 3.32 | reduce_tied_grads: 0.36 | comms: 18.12 | reduce_grads: 0.24 | step: 375.58 | _step_clipping: 0.12 | _step_step: 373.76 | _step_zero_grad: 0.57 | _step_check_overflow: 0.48 samples/sec: 15.512 | iteration 26010/ 143000 | elapsed time per iteration (ms): 66013.3 | learning rate: 5.574E-04 | approx flops per GPU: 66.9TFLOPS | lm_loss: 2.322099E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 23:35:40,519] [INFO] [logging.py:60:log_dist] [Rank 0] step=26020, skipped=31, lr=[0.0005573833619937647, 0.0005573833619937647], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26020 loss: 2.3227 iter time (s): 64.626 samples/sec: 15.845 %comms: 0.0027866593393000927 %optimizer_step 0.055718540739643795 %forward: 22.531463513889026 %backward: 60.385093801933856 [2025-04-12 23:35:40,520] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 36536.26 | forward: 145612.82 | backward_microstep: 390259.16 | backward: 390247.33 | backward_inner_microstep: 390228.64 | backward_inner: 390221.58 | backward_allreduce_microstep: 8.97 | backward_allreduce: 3.07 | reduce_tied_grads: 0.29 | comms: 18.01 | reduce_grads: 0.21 | step: 360.09 | _step_clipping: 0.13 | _step_step: 358.29 | _step_zero_grad: 0.54 | _step_check_overflow: 0.53 samples/sec: 15.845 | iteration 26020/ 143000 | elapsed time per iteration (ms): 64627.0 | learning rate: 5.574E-04 | approx flops per GPU: 68.4TFLOPS | lm_loss: 2.321805E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 23:46:33,756] [INFO] [logging.py:60:log_dist] [Rank 0] step=26030, skipped=31, lr=[0.0005573494962915489, 0.0005573494962915489], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26030 loss: 2.3163 iter time (s): 65.323 samples/sec: 15.676 %comms: 0.0027579281213080905 %optimizer_step 0.05562471991018636 %forward: 22.310577310586336 %backward: 59.74745713911965 [2025-04-12 23:46:33,756] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 43337.50 | forward: 145739.46 | backward_microstep: 390299.53 | backward: 390288.51 | backward_inner_microstep: 390270.08 | backward_inner: 390263.21 | backward_allreduce_microstep: 8.87 | backward_allreduce: 3.07 | reduce_tied_grads: 0.33 | comms: 18.02 | reduce_grads: 0.21 | step: 363.36 | _step_clipping: 0.13 | _step_step: 361.57 | _step_zero_grad: 0.57 | _step_check_overflow: 0.48 samples/sec: 15.676 | iteration 26030/ 143000 | elapsed time per iteration (ms): 65323.6 | learning rate: 5.573E-04 | approx flops per GPU: 67.6TFLOPS | lm_loss: 2.314135E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-12 23:57:05,411] [INFO] [logging.py:60:log_dist] [Rank 0] step=26040, skipped=31, lr=[0.0005573156181684925, 0.0005573156181684925], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26040 loss: 2.3159 iter time (s): 63.165 samples/sec: 16.212 %comms: 0.0028686106533351145 %optimizer_step 0.05698800290814365 %forward: 23.006034855279104 %backward: 61.76952403600539 [2025-04-12 23:57:05,412] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22287.49 | forward: 145317.57 | backward_microstep: 390176.71 | backward: 390167.08 | backward_inner_microstep: 390149.53 | backward_inner: 390143.10 | backward_allreduce_microstep: 8.50 | backward_allreduce: 2.89 | reduce_tied_grads: 0.32 | comms: 18.12 | reduce_grads: 0.22 | step: 359.96 | _step_clipping: 0.13 | _step_step: 358.11 | _step_zero_grad: 0.55 | _step_check_overflow: 0.55 samples/sec: 16.211 | iteration 26040/ 143000 | elapsed time per iteration (ms): 63165.6 | learning rate: 5.573E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.314848E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 00:07:38,121] [INFO] [logging.py:60:log_dist] [Rank 0] step=26050, skipped=31, lr=[0.0005572817276262305, 0.0005572817276262305], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26050 loss: 2.3080 iter time (s): 63.270 samples/sec: 16.184 %comms: 0.002836774196436323 %optimizer_step 0.05602984195263068 %forward: 23.009071701365315 %backward: 61.66241804265291 [2025-04-13 00:07:38,122] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23131.26 | forward: 145579.36 | backward_microstep: 390149.14 | backward: 390140.70 | backward_inner_microstep: 390123.69 | backward_inner: 390117.32 | backward_allreduce_microstep: 8.26 | backward_allreduce: 2.84 | reduce_tied_grads: 0.31 | comms: 17.95 | reduce_grads: 0.19 | step: 354.50 | _step_clipping: 0.12 | _step_step: 352.80 | _step_zero_grad: 0.48 | _step_check_overflow: 0.54 samples/sec: 16.184 | iteration 26050/ 143000 | elapsed time per iteration (ms): 63271.0 | learning rate: 5.573E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.317548E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 00:18:26,926] [INFO] [logging.py:60:log_dist] [Rank 0] step=26060, skipped=31, lr=[0.0005572478246663985, 0.0005572478246663985], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26060 loss: 2.3246 iter time (s): 64.880 samples/sec: 15.783 %comms: 0.002817743398605314 %optimizer_step 0.06016215458046725 %forward: 22.450992594172703 %backward: 60.17277743412739 [2025-04-13 00:18:26,926] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 38790.40 | forward: 145661.57 | backward_microstep: 390412.32 | backward: 390399.71 | backward_inner_microstep: 390381.04 | backward_inner: 390374.13 | backward_allreduce_microstep: 8.95 | backward_allreduce: 3.12 | reduce_tied_grads: 0.38 | comms: 18.28 | reduce_grads: 0.23 | step: 390.33 | _step_clipping: 0.15 | _step_step: 388.38 | _step_zero_grad: 0.59 | _step_check_overflow: 0.53 samples/sec: 15.783 | iteration 26060/ 143000 | elapsed time per iteration (ms): 64880.4 | learning rate: 5.572E-04 | approx flops per GPU: 68.1TFLOPS | lm_loss: 2.334389E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 00:29:24,633] [INFO] [logging.py:60:log_dist] [Rank 0] step=26070, skipped=31, lr=[0.0005572139092906331, 0.0005572139092906331], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26070 loss: 2.3277 iter time (s): 65.770 samples/sec: 15.569 %comms: 0.0027521567495083525 %optimizer_step 0.05610436164474032 %forward: 22.205104925279166 %backward: 59.35599491461202 [2025-04-13 00:29:24,634] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 47345.71 | forward: 146043.31 | backward_microstep: 390398.82 | backward: 390385.28 | backward_inner_microstep: 390364.94 | backward_inner: 390353.76 | backward_allreduce_microstep: 9.75 | backward_allreduce: 3.36 | reduce_tied_grads: 0.37 | comms: 18.10 | reduce_grads: 0.24 | step: 369.00 | _step_clipping: 0.14 | _step_step: 366.82 | _step_zero_grad: 0.64 | _step_check_overflow: 0.71 samples/sec: 15.569 | iteration 26070/ 143000 | elapsed time per iteration (ms): 65770.8 | learning rate: 5.572E-04 | approx flops per GPU: 67.2TFLOPS | lm_loss: 2.322517E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 00:40:02,978] [INFO] [logging.py:60:log_dist] [Rank 0] step=26080, skipped=31, lr=[0.0005571799815005711, 0.0005571799815005711], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26080 loss: 2.3199 iter time (s): 63.834 samples/sec: 16.042 %comms: 0.0028413914209863412 %optimizer_step 0.05596922958981379 %forward: 22.829726585727634 %backward: 61.150399745227865 [2025-04-13 00:40:02,979] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28344.39 | forward: 145730.92 | backward_microstep: 390357.12 | backward: 390346.50 | backward_inner_microstep: 390328.60 | backward_inner: 390321.97 | backward_allreduce_microstep: 8.63 | backward_allreduce: 2.96 | reduce_tied_grads: 0.29 | comms: 18.14 | reduce_grads: 0.19 | step: 357.27 | _step_clipping: 0.12 | _step_step: 355.46 | _step_zero_grad: 0.52 | _step_check_overflow: 0.59 samples/sec: 16.041 | iteration 26080/ 143000 | elapsed time per iteration (ms): 63834.5 | learning rate: 5.572E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.319466E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 00:50:47,186] [INFO] [logging.py:60:log_dist] [Rank 0] step=26090, skipped=31, lr=[0.0005571460412978498, 0.0005571460412978498], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26090 loss: 2.3012 iter time (s): 64.420 samples/sec: 15.896 %comms: 0.002828481764057854 %optimizer_step 0.0564672658920909 %forward: 22.58223470793048 %backward: 60.59316533897535 [2025-04-13 00:50:47,187] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 34461.83 | forward: 145475.24 | backward_microstep: 390357.27 | backward: 390342.46 | backward_inner_microstep: 390322.30 | backward_inner: 390315.33 | backward_allreduce_microstep: 8.93 | backward_allreduce: 3.07 | reduce_tied_grads: 0.38 | comms: 18.22 | reduce_grads: 0.23 | step: 363.76 | _step_clipping: 0.13 | _step_step: 361.83 | _step_zero_grad: 0.59 | _step_check_overflow: 0.58 samples/sec: 15.895 | iteration 26090/ 143000 | elapsed time per iteration (ms): 64420.8 | learning rate: 5.571E-04 | approx flops per GPU: 68.6TFLOPS | lm_loss: 2.312391E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 01:01:29,249] [INFO] [logging.py:60:log_dist] [Rank 0] step=26100, skipped=31, lr=[0.0005571120886841076, 0.0005571120886841076], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26100 loss: 2.3129 iter time (s): 64.206 samples/sec: 15.949 %comms: 0.002823896778302735 %optimizer_step 0.05565409012937181 %forward: 22.704593078934344 %backward: 60.804745806012626 [2025-04-13 01:01:29,250] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31963.57 | forward: 145776.35 | backward_microstep: 390413.53 | backward: 390400.93 | backward_inner_microstep: 390382.49 | backward_inner: 390375.53 | backward_allreduce_microstep: 8.87 | backward_allreduce: 3.06 | reduce_tied_grads: 0.33 | comms: 18.13 | reduce_grads: 0.22 | step: 357.33 | _step_clipping: 0.13 | _step_step: 355.46 | _step_zero_grad: 0.59 | _step_check_overflow: 0.49 samples/sec: 15.949 | iteration 26100/ 143000 | elapsed time per iteration (ms): 64206.3 | learning rate: 5.571E-04 | approx flops per GPU: 68.8TFLOPS | lm_loss: 2.314898E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 01:12:08,631] [INFO] [logging.py:60:log_dist] [Rank 0] step=26110, skipped=31, lr=[0.000557078123660983, 0.000557078123660983], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26110 loss: 2.3256 iter time (s): 63.938 samples/sec: 16.016 %comms: 0.002850057459080265 %optimizer_step 0.05627971335503198 %forward: 22.749571057887742 %backward: 61.03104220229285 [2025-04-13 01:12:08,631] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29822.00 | forward: 145455.19 | backward_microstep: 390227.80 | backward: 390217.56 | backward_inner_microstep: 390200.87 | backward_inner: 390194.46 | backward_allreduce_microstep: 7.99 | backward_allreduce: 2.73 | reduce_tied_grads: 0.34 | comms: 18.22 | reduce_grads: 0.20 | step: 359.84 | _step_clipping: 0.12 | _step_step: 358.12 | _step_zero_grad: 0.50 | _step_check_overflow: 0.50 samples/sec: 16.015 | iteration 26110/ 143000 | elapsed time per iteration (ms): 63938.1 | learning rate: 5.571E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.317934E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 01:22:37,979] [INFO] [logging.py:60:log_dist] [Rank 0] step=26120, skipped=31, lr=[0.0005570441462301154, 0.0005570441462301154], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26120 loss: 2.3138 iter time (s): 62.934 samples/sec: 16.271 %comms: 0.002858328771066197 %optimizer_step 0.060786291755556325 %forward: 23.08247664784509 %backward: 61.995994680908154 [2025-04-13 01:22:37,979] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20000.82 | forward: 145267.87 | backward_microstep: 390177.54 | backward: 390167.23 | backward_inner_microstep: 390148.67 | backward_inner: 390141.97 | backward_allreduce_microstep: 9.05 | backward_allreduce: 3.11 | reduce_tied_grads: 0.33 | comms: 17.99 | reduce_grads: 0.22 | step: 382.55 | _step_clipping: 0.13 | _step_step: 380.76 | _step_zero_grad: 0.51 | _step_check_overflow: 0.54 samples/sec: 16.271 | iteration 26120/ 143000 | elapsed time per iteration (ms): 62934.8 | learning rate: 5.570E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.317900E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 01:33:24,233] [INFO] [logging.py:60:log_dist] [Rank 0] step=26130, skipped=31, lr=[0.0005570101563931447, 0.0005570101563931447], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26130 loss: 2.3017 iter time (s): 64.625 samples/sec: 15.845 %comms: 0.0027883145484467716 %optimizer_step 0.055255068609578635 %forward: 22.54673456129522 %backward: 60.3900537542877 [2025-04-13 01:33:24,233] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 36405.38 | forward: 145707.91 | backward_microstep: 390280.26 | backward: 390269.75 | backward_inner_microstep: 390251.84 | backward_inner: 390244.97 | backward_allreduce_microstep: 8.71 | backward_allreduce: 2.96 | reduce_tied_grads: 0.32 | comms: 18.02 | reduce_grads: 0.21 | step: 357.08 | _step_clipping: 0.13 | _step_step: 355.30 | _step_zero_grad: 0.48 | _step_check_overflow: 0.59 samples/sec: 15.845 | iteration 26130/ 143000 | elapsed time per iteration (ms): 64625.4 | learning rate: 5.570E-04 | approx flops per GPU: 68.4TFLOPS | lm_loss: 2.314274E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 01:43:53,723] [INFO] [logging.py:60:log_dist] [Rank 0] step=26140, skipped=31, lr=[0.0005569761541517113, 0.0005569761541517113], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26140 loss: 2.3068 iter time (s): 62.948 samples/sec: 16.267 %comms: 0.002859390230244078 %optimizer_step 0.058121201688584256 %forward: 23.098005657522584 %backward: 61.97830750029265 [2025-04-13 01:43:53,723] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20077.51 | forward: 145398.30 | backward_microstep: 390152.69 | backward: 390143.67 | backward_inner_microstep: 390126.60 | backward_inner: 390120.22 | backward_allreduce_microstep: 8.27 | backward_allreduce: 2.84 | reduce_tied_grads: 0.32 | comms: 18.00 | reduce_grads: 0.20 | step: 365.86 | _step_clipping: 0.12 | _step_step: 364.04 | _step_zero_grad: 0.49 | _step_check_overflow: 0.64 samples/sec: 16.267 | iteration 26140/ 143000 | elapsed time per iteration (ms): 62949.0 | learning rate: 5.570E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.319352E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 01:54:27,852] [INFO] [logging.py:60:log_dist] [Rank 0] step=26150, skipped=31, lr=[0.0005569421395074564, 0.0005569421395074564], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26150 loss: 2.3171 iter time (s): 63.412 samples/sec: 16.148 %comms: 0.002844825875296927 %optimizer_step 0.05619377061923978 %forward: 22.9359921449539 %backward: 61.53071103417157 [2025-04-13 01:54:27,853] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24635.30 | forward: 145442.47 | backward_microstep: 390190.16 | backward: 390180.57 | backward_inner_microstep: 390163.01 | backward_inner: 390156.33 | backward_allreduce_microstep: 8.42 | backward_allreduce: 2.89 | reduce_tied_grads: 0.31 | comms: 18.04 | reduce_grads: 0.21 | step: 356.34 | _step_clipping: 0.12 | _step_step: 354.45 | _step_zero_grad: 0.55 | _step_check_overflow: 0.59 samples/sec: 16.148 | iteration 26150/ 143000 | elapsed time per iteration (ms): 63412.9 | learning rate: 5.569E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.317558E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 02:05:05,548] [INFO] [logging.py:60:log_dist] [Rank 0] step=26160, skipped=31, lr=[0.0005569081124620217, 0.0005569081124620217], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26160 loss: 2.3116 iter time (s): 63.769 samples/sec: 16.058 %comms: 0.002848917717586073 %optimizer_step 0.056722359658716144 %forward: 22.833239812954776 %backward: 61.196444374516844 [2025-04-13 02:05:05,548] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27973.34 | forward: 145605.24 | backward_microstep: 390255.51 | backward: 390243.48 | backward_inner_microstep: 390225.39 | backward_inner: 390218.72 | backward_allreduce_microstep: 8.68 | backward_allreduce: 2.98 | reduce_tied_grads: 0.33 | comms: 18.17 | reduce_grads: 0.22 | step: 361.71 | _step_clipping: 0.13 | _step_step: 359.74 | _step_zero_grad: 0.60 | _step_check_overflow: 0.62 samples/sec: 16.058 | iteration 26160/ 143000 | elapsed time per iteration (ms): 63769.6 | learning rate: 5.569E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.314538E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 02:16:03,250] [INFO] [logging.py:60:log_dist] [Rank 0] step=26170, skipped=31, lr=[0.0005568740730170496, 0.0005568740730170496], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26170 loss: 2.3402 iter time (s): 65.770 samples/sec: 15.570 %comms: 0.0028284513367871113 %optimizer_step 0.05603643324596234 %forward: 22.170662300333866 %backward: 59.35684715610425 [2025-04-13 02:16:03,251] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 47558.91 | forward: 145815.55 | backward_microstep: 390400.79 | backward: 390387.57 | backward_inner_microstep: 390367.48 | backward_inner: 390360.11 | backward_allreduce_microstep: 9.82 | backward_allreduce: 3.51 | reduce_tied_grads: 0.53 | comms: 18.60 | reduce_grads: 0.26 | step: 368.55 | _step_clipping: 0.13 | _step_step: 366.64 | _step_zero_grad: 0.59 | _step_check_overflow: 0.55 samples/sec: 15.569 | iteration 26170/ 143000 | elapsed time per iteration (ms): 65770.2 | learning rate: 5.569E-04 | approx flops per GPU: 67.2TFLOPS | lm_loss: 2.323896E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 02:26:40,794] [INFO] [logging.py:60:log_dist] [Rank 0] step=26180, skipped=31, lr=[0.0005568400211741828, 0.0005568400211741828], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26180 loss: 2.3300 iter time (s): 63.754 samples/sec: 16.062 %comms: 0.002850868080635804 %optimizer_step 0.056686736756014905 %forward: 22.827692149525827 %backward: 61.223264386142326 [2025-04-13 02:26:40,794] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27793.89 | forward: 145535.18 | backward_microstep: 390332.93 | backward: 390321.49 | backward_inner_microstep: 390302.99 | backward_inner: 390296.25 | backward_allreduce_microstep: 8.90 | backward_allreduce: 3.07 | reduce_tied_grads: 0.33 | comms: 18.18 | reduce_grads: 0.28 | step: 361.40 | _step_clipping: 0.14 | _step_step: 359.48 | _step_zero_grad: 0.57 | _step_check_overflow: 0.56 samples/sec: 16.062 | iteration 26180/ 143000 | elapsed time per iteration (ms): 63754.4 | learning rate: 5.568E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.328526E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 02:37:16,535] [INFO] [logging.py:60:log_dist] [Rank 0] step=26190, skipped=31, lr=[0.0005568059569350649, 0.0005568059569350649], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26190 loss: 2.3322 iter time (s): 63.574 samples/sec: 16.107 %comms: 0.0028280465043023597 %optimizer_step 0.055951697561250534 %forward: 22.87683596754495 %backward: 61.38038805137109 [2025-04-13 02:37:16,536] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26222.16 | forward: 145436.26 | backward_microstep: 390226.16 | backward: 390217.16 | backward_inner_microstep: 390199.89 | backward_inner: 390193.41 | backward_allreduce_microstep: 8.35 | backward_allreduce: 2.84 | reduce_tied_grads: 0.30 | comms: 17.98 | reduce_grads: 0.21 | step: 355.71 | _step_clipping: 0.13 | _step_step: 353.85 | _step_zero_grad: 0.50 | _step_check_overflow: 0.65 samples/sec: 16.107 | iteration 26190/ 143000 | elapsed time per iteration (ms): 63574.2 | learning rate: 5.568E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.319273E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 02:47:53,246] [INFO] [logging.py:60:log_dist] [Rank 0] step=26200, skipped=31, lr=[0.0005567718803013399, 0.0005567718803013399], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26200 loss: 2.3123 iter time (s): 63.670 samples/sec: 16.083 %comms: 0.0028359515723091243 %optimizer_step 0.055842015247924724 %forward: 22.841229336787745 %backward: 61.284028859920845 [2025-04-13 02:47:53,246] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27194.65 | forward: 145431.13 | backward_microstep: 390208.08 | backward: 390198.17 | backward_inner_microstep: 390180.82 | backward_inner: 390174.34 | backward_allreduce_microstep: 8.34 | backward_allreduce: 2.86 | reduce_tied_grads: 0.30 | comms: 18.06 | reduce_grads: 0.20 | step: 355.55 | _step_clipping: 0.12 | _step_step: 353.71 | _step_zero_grad: 0.53 | _step_check_overflow: 0.59 samples/sec: 16.083 | iteration 26200/ 143000 | elapsed time per iteration (ms): 63671.1 | learning rate: 5.568E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.323383E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 02:58:30,419] [INFO] [logging.py:60:log_dist] [Rank 0] step=26210, skipped=31, lr=[0.0005567377912746525, 0.0005567377912746525], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26210 loss: 2.3002 iter time (s): 63.717 samples/sec: 16.071 %comms: 0.0028284697156714814 %optimizer_step 0.05645407028030753 %forward: 22.83164990497889 %backward: 61.24842560758152 [2025-04-13 02:58:30,419] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27563.45 | forward: 145475.62 | backward_microstep: 390264.38 | backward: 390254.43 | backward_inner_microstep: 390236.09 | backward_inner: 390229.35 | backward_allreduce_microstep: 8.95 | backward_allreduce: 3.07 | reduce_tied_grads: 0.33 | comms: 18.02 | reduce_grads: 0.21 | step: 359.71 | _step_clipping: 0.12 | _step_step: 357.99 | _step_zero_grad: 0.53 | _step_check_overflow: 0.46 samples/sec: 16.071 | iteration 26210/ 143000 | elapsed time per iteration (ms): 63717.2 | learning rate: 5.567E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.318812E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 03:09:17,745] [INFO] [logging.py:60:log_dist] [Rank 0] step=26220, skipped=31, lr=[0.0005567036898566481, 0.0005567036898566481], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26220 loss: 2.3344 iter time (s): 64.732 samples/sec: 15.819 %comms: 0.0028014840306047665 %optimizer_step 0.05705831312250434 %forward: 22.497793003536025 %backward: 60.28944613895355 [2025-04-13 03:09:17,746] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 37546.34 | forward: 145632.93 | backward_microstep: 390276.48 | backward: 390266.24 | backward_inner_microstep: 390247.46 | backward_inner: 390240.53 | backward_allreduce_microstep: 9.11 | backward_allreduce: 3.12 | reduce_tied_grads: 0.34 | comms: 18.13 | reduce_grads: 0.24 | step: 369.35 | _step_clipping: 0.13 | _step_step: 367.39 | _step_zero_grad: 0.61 | _step_check_overflow: 0.55 samples/sec: 15.819 | iteration 26220/ 143000 | elapsed time per iteration (ms): 64732.7 | learning rate: 5.567E-04 | approx flops per GPU: 68.2TFLOPS | lm_loss: 2.320518E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 03:19:54,921] [INFO] [logging.py:60:log_dist] [Rank 0] step=26230, skipped=31, lr=[0.0005566695760489726, 0.0005566695760489726], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26230 loss: 2.3338 iter time (s): 63.717 samples/sec: 16.071 %comms: 0.003085149083451487 %optimizer_step 0.06142544921600416 %forward: 22.81062210071665 %backward: 61.23772940044161 [2025-04-13 03:19:54,922] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27737.29 | forward: 145342.20 | backward_microstep: 390197.35 | backward: 390187.79 | backward_inner_microstep: 390168.78 | backward_inner: 390161.99 | backward_allreduce_microstep: 9.27 | backward_allreduce: 3.19 | reduce_tied_grads: 0.35 | comms: 19.66 | reduce_grads: 0.22 | step: 391.38 | _step_clipping: 0.14 | _step_step: 389.36 | _step_zero_grad: 0.64 | _step_check_overflow: 0.57 samples/sec: 16.071 | iteration 26230/ 143000 | elapsed time per iteration (ms): 63717.6 | learning rate: 5.567E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.328214E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 03:30:28,290] [INFO] [logging.py:60:log_dist] [Rank 0] step=26240, skipped=31, lr=[0.0005566354498532723, 0.0005566354498532723], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26240 loss: 2.3086 iter time (s): 63.336 samples/sec: 16.168 %comms: 0.002823846458674577 %optimizer_step 0.05499965725251169 %forward: 22.964673280654175 %backward: 61.606166049279565 [2025-04-13 03:30:28,291] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23871.97 | forward: 145449.82 | backward_microstep: 390200.07 | backward: 390190.87 | backward_inner_microstep: 390174.47 | backward_inner: 390168.28 | backward_allreduce_microstep: 7.84 | backward_allreduce: 2.69 | reduce_tied_grads: 0.28 | comms: 17.89 | reduce_grads: 0.19 | step: 348.35 | _step_clipping: 0.11 | _step_step: 346.68 | _step_zero_grad: 0.47 | _step_check_overflow: 0.52 samples/sec: 16.168 | iteration 26240/ 143000 | elapsed time per iteration (ms): 63336.9 | learning rate: 5.566E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.319180E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 03:40:58,636] [INFO] [logging.py:60:log_dist] [Rank 0] step=26250, skipped=31, lr=[0.0005566013112711946, 0.0005566013112711946], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26250 loss: 2.3270 iter time (s): 63.034 samples/sec: 16.245 %comms: 0.003031838617390511 %optimizer_step 0.05689657047298131 %forward: 23.07382725797171 %backward: 61.912402304790426 [2025-04-13 03:40:58,637] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20757.86 | forward: 145443.76 | backward_microstep: 390268.53 | backward: 390259.18 | backward_inner_microstep: 390242.57 | backward_inner: 390236.27 | backward_allreduce_microstep: 8.08 | backward_allreduce: 2.72 | reduce_tied_grads: 0.30 | comms: 19.11 | reduce_grads: 0.20 | step: 358.64 | _step_clipping: 0.11 | _step_step: 356.88 | _step_zero_grad: 0.50 | _step_check_overflow: 0.55 samples/sec: 16.245 | iteration 26250/ 143000 | elapsed time per iteration (ms): 63034.6 | learning rate: 5.566E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.319895E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 03:51:39,781] [INFO] [logging.py:60:log_dist] [Rank 0] step=26260, skipped=31, lr=[0.0005565671603043868, 0.0005565671603043868], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26260 loss: 2.3203 iter time (s): 64.114 samples/sec: 15.972 %comms: 0.002893687448182649 %optimizer_step 0.05661222544001167 %forward: 22.7425256411136 %backward: 60.897939094082346 [2025-04-13 03:51:39,781] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30992.49 | forward: 145811.07 | backward_microstep: 390453.62 | backward: 390440.08 | backward_inner_microstep: 390421.85 | backward_inner: 390415.05 | backward_allreduce_microstep: 8.68 | backward_allreduce: 2.99 | reduce_tied_grads: 0.33 | comms: 18.55 | reduce_grads: 0.22 | step: 362.96 | _step_clipping: 0.15 | _step_step: 361.05 | _step_zero_grad: 0.52 | _step_check_overflow: 0.57 samples/sec: 15.971 | iteration 26260/ 143000 | elapsed time per iteration (ms): 64114.4 | learning rate: 5.566E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.319757E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 04:02:11,325] [INFO] [logging.py:60:log_dist] [Rank 0] step=26270, skipped=31, lr=[0.0005565329969544975, 0.0005565329969544975], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26270 loss: 2.3249 iter time (s): 63.154 samples/sec: 16.214 %comms: 0.0030408869285278397 %optimizer_step 0.056184245204123136 %forward: 23.042023573698128 %backward: 61.79649292886064 [2025-04-13 04:02:11,326] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21897.86 | forward: 145519.30 | backward_microstep: 390278.48 | backward: 390268.78 | backward_inner_microstep: 390251.87 | backward_inner: 390243.80 | backward_allreduce_microstep: 8.13 | backward_allreduce: 2.82 | reduce_tied_grads: 0.29 | comms: 19.20 | reduce_grads: 0.20 | step: 354.83 | _step_clipping: 0.11 | _step_step: 353.14 | _step_zero_grad: 0.48 | _step_check_overflow: 0.53 samples/sec: 16.214 | iteration 26270/ 143000 | elapsed time per iteration (ms): 63154.4 | learning rate: 5.565E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.334054E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 04:12:45,346] [INFO] [logging.py:60:log_dist] [Rank 0] step=26280, skipped=31, lr=[0.0005564988212231753, 0.0005564988212231753], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26280 loss: 2.3232 iter time (s): 63.402 samples/sec: 16.151 %comms: 0.0028532076709286576 %optimizer_step 0.05648297508931408 %forward: 22.952044997301382 %backward: 61.56484860243039 [2025-04-13 04:12:45,347] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24247.69 | forward: 145519.46 | backward_microstep: 390343.30 | backward: 390330.52 | backward_inner_microstep: 390312.47 | backward_inner: 390305.60 | backward_allreduce_microstep: 8.59 | backward_allreduce: 2.97 | reduce_tied_grads: 0.31 | comms: 18.09 | reduce_grads: 0.21 | step: 358.11 | _step_clipping: 0.14 | _step_step: 356.20 | _step_zero_grad: 0.54 | _step_check_overflow: 0.63 samples/sec: 16.151 | iteration 26280/ 143000 | elapsed time per iteration (ms): 63402.1 | learning rate: 5.565E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.313176E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 04:23:25,212] [INFO] [logging.py:60:log_dist] [Rank 0] step=26290, skipped=31, lr=[0.0005564646331120698, 0.0005564646331120698], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26290 loss: 2.3341 iter time (s): 63.986 samples/sec: 16.004 %comms: 0.0028162290444861075 %optimizer_step 0.05691904402431591 %forward: 22.731263659019202 %backward: 60.98543094116735 [2025-04-13 04:23:25,212] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30283.25 | forward: 145448.19 | backward_microstep: 390231.34 | backward: 390221.19 | backward_inner_microstep: 390203.58 | backward_inner: 390196.92 | backward_allreduce_microstep: 8.49 | backward_allreduce: 2.90 | reduce_tied_grads: 0.35 | comms: 18.02 | reduce_grads: 0.23 | step: 364.20 | _step_clipping: 0.14 | _step_step: 362.34 | _step_zero_grad: 0.54 | _step_check_overflow: 0.57 samples/sec: 16.003 | iteration 26290/ 143000 | elapsed time per iteration (ms): 63986.6 | learning rate: 5.565E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.319307E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 04:34:06,821] [INFO] [logging.py:60:log_dist] [Rank 0] step=26300, skipped=31, lr=[0.0005564304326228312, 0.0005564304326228312], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26300 loss: 2.3290 iter time (s): 64.160 samples/sec: 15.960 %comms: 0.0028033369409797486 %optimizer_step 0.061027166244229734 %forward: 22.744424785437204 %backward: 60.831315367363736 [2025-04-13 04:34:06,821] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31452.42 | forward: 145928.94 | backward_microstep: 390306.11 | backward: 390295.62 | backward_inner_microstep: 390277.26 | backward_inner: 390270.55 | backward_allreduce_microstep: 8.94 | backward_allreduce: 3.02 | reduce_tied_grads: 0.31 | comms: 17.99 | reduce_grads: 0.21 | step: 391.55 | _step_clipping: 0.11 | _step_step: 389.58 | _step_zero_grad: 0.57 | _step_check_overflow: 0.69 samples/sec: 15.960 | iteration 26300/ 143000 | elapsed time per iteration (ms): 64160.9 | learning rate: 5.564E-04 | approx flops per GPU: 68.8TFLOPS | lm_loss: 2.320028E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 04:44:35,592] [INFO] [logging.py:60:log_dist] [Rank 0] step=26310, skipped=31, lr=[0.00055639621975711, 0.00055639621975711], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26310 loss: 2.3084 iter time (s): 62.877 samples/sec: 16.286 %comms: 0.002875701826673678 %optimizer_step 0.05638232070495654 %forward: 23.1032603109349 %backward: 62.06697678557119 [2025-04-13 04:44:35,593] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19350.01 | forward: 145265.38 | backward_microstep: 390264.63 | backward: 390255.86 | backward_inner_microstep: 390238.93 | backward_inner: 390232.65 | backward_allreduce_microstep: 8.31 | backward_allreduce: 2.98 | reduce_tied_grads: 0.28 | comms: 18.08 | reduce_grads: 0.18 | step: 354.51 | _step_clipping: 0.10 | _step_step: 352.78 | _step_zero_grad: 0.48 | _step_check_overflow: 0.59 samples/sec: 16.286 | iteration 26310/ 143000 | elapsed time per iteration (ms): 62877.1 | learning rate: 5.564E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.313324E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 04:55:21,209] [INFO] [logging.py:60:log_dist] [Rank 0] step=26320, skipped=31, lr=[0.0005563619945165575, 0.0005563619945165575], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26320 loss: 2.3168 iter time (s): 64.561 samples/sec: 15.861 %comms: 0.002865740181087784 %optimizer_step 0.05648120156627064 %forward: 22.552046993861094 %backward: 60.44465111367799 [2025-04-13 04:55:21,209] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 35853.85 | forward: 145598.39 | backward_microstep: 390248.03 | backward: 390237.03 | backward_inner_microstep: 390217.95 | backward_inner: 390211.11 | backward_allreduce_microstep: 9.28 | backward_allreduce: 3.19 | reduce_tied_grads: 0.38 | comms: 18.50 | reduce_grads: 0.25 | step: 364.65 | _step_clipping: 0.14 | _step_step: 362.70 | _step_zero_grad: 0.55 | _step_check_overflow: 0.58 samples/sec: 15.861 | iteration 26320/ 143000 | elapsed time per iteration (ms): 64561.6 | learning rate: 5.564E-04 | approx flops per GPU: 68.4TFLOPS | lm_loss: 2.320496E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 05:06:02,916] [INFO] [logging.py:60:log_dist] [Rank 0] step=26330, skipped=31, lr=[0.0005563277569028256, 0.0005563277569028256], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26330 loss: 2.3063 iter time (s): 64.170 samples/sec: 15.958 %comms: 0.0028321849067762827 %optimizer_step 0.055965189440983644 %forward: 22.698540065006213 %backward: 60.82581079204928 [2025-04-13 05:06:02,917] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31803.69 | forward: 145656.86 | backward_microstep: 390333.45 | backward: 390320.11 | backward_inner_microstep: 390301.71 | backward_inner: 390294.89 | backward_allreduce_microstep: 8.76 | backward_allreduce: 3.00 | reduce_tied_grads: 0.34 | comms: 18.17 | reduce_grads: 0.22 | step: 359.13 | _step_clipping: 0.14 | _step_step: 357.35 | _step_zero_grad: 0.52 | _step_check_overflow: 0.51 samples/sec: 15.957 | iteration 26330/ 143000 | elapsed time per iteration (ms): 64170.7 | learning rate: 5.563E-04 | approx flops per GPU: 68.8TFLOPS | lm_loss: 2.315977E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 05:16:44,507] [INFO] [logging.py:60:log_dist] [Rank 0] step=26340, skipped=31, lr=[0.0005562935069175668, 0.0005562935069175668], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26340 loss: 2.3195 iter time (s): 64.158 samples/sec: 15.960 %comms: 0.0028596423902828294 %optimizer_step 0.05865070689644169 %forward: 22.685799083522685 %backward: 60.838584792787806 [2025-04-13 05:16:44,508] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31795.31 | forward: 145548.59 | backward_microstep: 390341.90 | backward: 390330.99 | backward_inner_microstep: 390311.98 | backward_inner: 390305.03 | backward_allreduce_microstep: 9.18 | backward_allreduce: 3.20 | reduce_tied_grads: 0.39 | comms: 18.35 | reduce_grads: 0.25 | step: 376.29 | _step_clipping: 0.15 | _step_step: 374.25 | _step_zero_grad: 0.60 | _step_check_overflow: 0.62 samples/sec: 15.960 | iteration 26340/ 143000 | elapsed time per iteration (ms): 64159.1 | learning rate: 5.563E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.318445E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 05:27:19,959] [INFO] [logging.py:60:log_dist] [Rank 0] step=26350, skipped=31, lr=[0.000556259244562434, 0.000556259244562434], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26350 loss: 2.3184 iter time (s): 63.545 samples/sec: 16.115 %comms: 0.0028334246211484137 %optimizer_step 0.055726819031967675 %forward: 22.8782024027819 %backward: 61.42447894365019 [2025-04-13 05:27:19,960] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25859.26 | forward: 145378.71 | backward_microstep: 390330.59 | backward: 390319.63 | backward_inner_microstep: 390302.60 | backward_inner: 390296.26 | backward_allreduce_microstep: 8.13 | backward_allreduce: 2.79 | reduce_tied_grads: 0.28 | comms: 18.00 | reduce_grads: 0.19 | step: 354.11 | _step_clipping: 0.13 | _step_step: 352.43 | _step_zero_grad: 0.48 | _step_check_overflow: 0.50 samples/sec: 16.115 | iteration 26350/ 143000 | elapsed time per iteration (ms): 63545.2 | learning rate: 5.563E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.324771E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 05:37:57,732] [INFO] [logging.py:60:log_dist] [Rank 0] step=26360, skipped=31, lr=[0.000556224969839081, 0.000556224969839081], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26360 loss: 2.3118 iter time (s): 63.777 samples/sec: 16.056 %comms: 0.0028202346596059814 %optimizer_step 0.056036056323859995 %forward: 22.80255517035875 %backward: 61.18749076241934 [2025-04-13 05:37:57,733] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28237.41 | forward: 145427.25 | backward_microstep: 390243.86 | backward: 390233.84 | backward_inner_microstep: 390215.24 | backward_inner: 390208.64 | backward_allreduce_microstep: 9.22 | backward_allreduce: 3.45 | reduce_tied_grads: 0.31 | comms: 17.99 | reduce_grads: 0.20 | step: 357.38 | _step_clipping: 0.13 | _step_step: 355.65 | _step_zero_grad: 0.52 | _step_check_overflow: 0.51 samples/sec: 16.056 | iteration 26360/ 143000 | elapsed time per iteration (ms): 63777.3 | learning rate: 5.562E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.319016E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 05:48:32,171] [INFO] [logging.py:60:log_dist] [Rank 0] step=26370, skipped=31, lr=[0.000556190682749162, 0.000556190682749162], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26370 loss: 2.3073 iter time (s): 63.443 samples/sec: 16.141 %comms: 0.002894010990498379 %optimizer_step 0.05701020139080842 %forward: 22.943057458691356 %backward: 61.51566530001858 [2025-04-13 05:48:32,172] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24684.64 | forward: 145556.87 | backward_microstep: 390282.41 | backward: 390271.76 | backward_inner_microstep: 390253.56 | backward_inner: 390246.88 | backward_allreduce_microstep: 8.81 | backward_allreduce: 3.00 | reduce_tied_grads: 0.37 | comms: 18.36 | reduce_grads: 0.23 | step: 361.69 | _step_clipping: 0.14 | _step_step: 359.66 | _step_zero_grad: 0.54 | _step_check_overflow: 0.71 samples/sec: 16.140 | iteration 26370/ 143000 | elapsed time per iteration (ms): 63443.9 | learning rate: 5.562E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.314309E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 05:59:20,110] [INFO] [logging.py:60:log_dist] [Rank 0] step=26380, skipped=31, lr=[0.000556156383294332, 0.000556156383294332], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26380 loss: 2.3366 iter time (s): 64.793 samples/sec: 15.804 %comms: 0.00285690958386241 %optimizer_step 0.05863497827104999 %forward: 22.480733469454748 %backward: 60.234446110483596 [2025-04-13 05:59:20,110] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 38097.26 | forward: 145659.76 | backward_microstep: 390288.24 | backward: 390277.98 | backward_inner_microstep: 390257.78 | backward_inner: 390251.03 | backward_allreduce_microstep: 8.98 | backward_allreduce: 3.08 | reduce_tied_grads: 0.36 | comms: 18.51 | reduce_grads: 0.24 | step: 379.91 | _step_clipping: 0.15 | _step_step: 377.61 | _step_zero_grad: 0.65 | _step_check_overflow: 0.73 samples/sec: 15.804 | iteration 26380/ 143000 | elapsed time per iteration (ms): 64793.9 | learning rate: 5.562E-04 | approx flops per GPU: 68.2TFLOPS | lm_loss: 2.338090E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 06:10:00,150] [INFO] [logging.py:60:log_dist] [Rank 0] step=26390, skipped=31, lr=[0.0005561220714762461, 0.0005561220714762461], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26390 loss: 2.3292 iter time (s): 64.003 samples/sec: 15.999 %comms: 0.002851969319756675 %optimizer_step 0.05678019434530569 %forward: 22.760777868032957 %backward: 60.9804130978081 [2025-04-13 06:10:00,151] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30151.27 | forward: 145676.65 | backward_microstep: 390307.35 | backward: 390295.20 | backward_inner_microstep: 390276.92 | backward_inner: 390270.24 | backward_allreduce_microstep: 8.88 | backward_allreduce: 3.05 | reduce_tied_grads: 0.33 | comms: 18.25 | reduce_grads: 0.23 | step: 363.41 | _step_clipping: 0.14 | _step_step: 361.50 | _step_zero_grad: 0.54 | _step_check_overflow: 0.58 samples/sec: 15.999 | iteration 26390/ 143000 | elapsed time per iteration (ms): 64004.0 | learning rate: 5.561E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.320447E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 06:20:39,338] [INFO] [logging.py:60:log_dist] [Rank 0] step=26400, skipped=31, lr=[0.0005560877472965606, 0.0005560877472965606], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26400 loss: 2.3056 iter time (s): 63.918 samples/sec: 16.020 %comms: 0.0028333512231214107 %optimizer_step 0.057084380529246345 %forward: 22.779471043532645 %backward: 61.05807176799496 [2025-04-13 06:20:39,338] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29398.16 | forward: 145602.31 | backward_microstep: 390282.36 | backward: 390272.30 | backward_inner_microstep: 390254.60 | backward_inner: 390248.02 | backward_allreduce_microstep: 8.55 | backward_allreduce: 2.94 | reduce_tied_grads: 0.33 | comms: 18.11 | reduce_grads: 0.21 | step: 364.87 | _step_clipping: 0.12 | _step_step: 362.83 | _step_zero_grad: 0.52 | _step_check_overflow: 0.81 samples/sec: 16.020 | iteration 26400/ 143000 | elapsed time per iteration (ms): 63918.8 | learning rate: 5.561E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.313464E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 06:31:23,678] [INFO] [logging.py:60:log_dist] [Rank 0] step=26410, skipped=31, lr=[0.0005560534107569322, 0.0005560534107569322], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26410 loss: 2.3313 iter time (s): 64.433 samples/sec: 15.892 %comms: 0.0028150615265742454 %optimizer_step 0.056576860717760205 %forward: 22.631419986235716 %backward: 60.57338171347503 [2025-04-13 06:31:23,679] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 34271.11 | forward: 145822.02 | backward_microstep: 390305.79 | backward: 390295.11 | backward_inner_microstep: 390275.43 | backward_inner: 390268.33 | backward_allreduce_microstep: 9.61 | backward_allreduce: 3.30 | reduce_tied_grads: 0.34 | comms: 18.14 | reduce_grads: 0.22 | step: 364.54 | _step_clipping: 0.13 | _step_step: 362.68 | _step_zero_grad: 0.54 | _step_check_overflow: 0.57 samples/sec: 15.892 | iteration 26410/ 143000 | elapsed time per iteration (ms): 64434.0 | learning rate: 5.561E-04 | approx flops per GPU: 68.6TFLOPS | lm_loss: 2.314204E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 06:42:02,844] [INFO] [logging.py:60:log_dist] [Rank 0] step=26420, skipped=31, lr=[0.0005560190618590178, 0.0005560190618590178], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26420 loss: 2.3175 iter time (s): 63.916 samples/sec: 16.021 %comms: 0.0028183798400921493 %optimizer_step 0.05529140064579319 %forward: 22.75712320698842 %backward: 61.057870964982854 [2025-04-13 06:42:02,844] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29555.76 | forward: 145454.41 | backward_microstep: 390268.03 | backward: 390257.44 | backward_inner_microstep: 390239.90 | backward_inner: 390233.36 | backward_allreduce_microstep: 8.53 | backward_allreduce: 2.86 | reduce_tied_grads: 0.33 | comms: 18.01 | reduce_grads: 0.23 | step: 353.40 | _step_clipping: 0.13 | _step_step: 351.70 | _step_zero_grad: 0.49 | _step_check_overflow: 0.47 samples/sec: 16.021 | iteration 26420/ 143000 | elapsed time per iteration (ms): 63916.5 | learning rate: 5.560E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.318941E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 06:52:41,488] [INFO] [logging.py:60:log_dist] [Rank 0] step=26430, skipped=31, lr=[0.0005559847006044756, 0.0005559847006044756], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26430 loss: 2.3217 iter time (s): 63.864 samples/sec: 16.034 %comms: 0.0028451332458112244 %optimizer_step 0.05562530234207276 %forward: 22.761832809622703 %backward: 61.09878529273956 [2025-04-13 06:52:41,489] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29198.60 | forward: 145365.84 | backward_microstep: 390210.66 | backward: 390200.41 | backward_inner_microstep: 390183.01 | backward_inner: 390176.32 | backward_allreduce_microstep: 8.32 | backward_allreduce: 2.86 | reduce_tied_grads: 0.31 | comms: 18.17 | reduce_grads: 0.22 | step: 355.24 | _step_clipping: 0.13 | _step_step: 353.55 | _step_zero_grad: 0.50 | _step_check_overflow: 0.47 samples/sec: 16.034 | iteration 26430/ 143000 | elapsed time per iteration (ms): 63864.4 | learning rate: 5.560E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.317188E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 07:03:24,220] [INFO] [logging.py:60:log_dist] [Rank 0] step=26440, skipped=31, lr=[0.0005559503269949638, 0.0005559503269949638], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26440 loss: 2.3273 iter time (s): 64.273 samples/sec: 15.932 %comms: 0.0028400950027525195 %optimizer_step 0.059064205266874226 %forward: 22.65603439675511 %backward: 60.727880712157955 [2025-04-13 07:03:24,221] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32869.95 | forward: 145616.32 | backward_microstep: 390325.51 | backward: 390314.15 | backward_inner_microstep: 390295.18 | backward_inner: 390288.20 | backward_allreduce_microstep: 9.05 | backward_allreduce: 3.11 | reduce_tied_grads: 0.35 | comms: 18.25 | reduce_grads: 0.24 | step: 379.62 | _step_clipping: 0.14 | _step_step: 377.75 | _step_zero_grad: 0.56 | _step_check_overflow: 0.52 samples/sec: 15.932 | iteration 26440/ 143000 | elapsed time per iteration (ms): 64273.3 | learning rate: 5.560E-04 | approx flops per GPU: 68.7TFLOPS | lm_loss: 2.312885E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 07:14:05,063] [INFO] [logging.py:60:log_dist] [Rank 0] step=26450, skipped=31, lr=[0.0005559159410321416, 0.0005559159410321416], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26450 loss: 2.3269 iter time (s): 64.084 samples/sec: 15.979 %comms: 0.002810595941117373 %optimizer_step 0.05729087961245065 %forward: 22.731286537782115 %backward: 60.88576471476214 [2025-04-13 07:14:05,064] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31119.53 | forward: 145670.44 | backward_microstep: 390187.77 | backward: 390178.35 | backward_inner_microstep: 390160.38 | backward_inner: 390153.64 | backward_allreduce_microstep: 8.66 | backward_allreduce: 2.95 | reduce_tied_grads: 0.32 | comms: 18.01 | reduce_grads: 0.21 | step: 367.14 | _step_clipping: 0.11 | _step_step: 365.36 | _step_zero_grad: 0.58 | _step_check_overflow: 0.48 samples/sec: 15.979 | iteration 26450/ 143000 | elapsed time per iteration (ms): 64084.3 | learning rate: 5.559E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.333944E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 07:24:52,010] [INFO] [logging.py:60:log_dist] [Rank 0] step=26460, skipped=31, lr=[0.0005558815427176685, 0.0005558815427176685], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26460 loss: 2.3265 iter time (s): 64.694 samples/sec: 15.828 %comms: 0.0027966430365624164 %optimizer_step 0.0573674458126107 %forward: 22.511209148425458 %backward: 60.33255274244846 [2025-04-13 07:24:52,011] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 37092.15 | forward: 145634.26 | backward_microstep: 390327.79 | backward: 390316.06 | backward_inner_microstep: 390297.01 | backward_inner: 390290.03 | backward_allreduce_microstep: 9.21 | backward_allreduce: 3.17 | reduce_tied_grads: 0.35 | comms: 18.09 | reduce_grads: 0.23 | step: 371.13 | _step_clipping: 0.13 | _step_step: 369.06 | _step_zero_grad: 0.57 | _step_check_overflow: 0.72 samples/sec: 15.828 | iteration 26460/ 143000 | elapsed time per iteration (ms): 64694.7 | learning rate: 5.559E-04 | approx flops per GPU: 68.3TFLOPS | lm_loss: 2.319807E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 07:35:28,765] [INFO] [logging.py:60:log_dist] [Rank 0] step=26470, skipped=31, lr=[0.0005558471320532047, 0.0005558471320532047], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26470 loss: 2.3235 iter time (s): 63.675 samples/sec: 16.082 %comms: 0.0028664238853488137 %optimizer_step 0.05845173514795361 %forward: 22.83942737994391 %backward: 61.28066554688695 [2025-04-13 07:35:28,766] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27226.41 | forward: 145429.59 | backward_microstep: 390213.00 | backward: 390203.39 | backward_inner_microstep: 390185.42 | backward_inner: 390178.60 | backward_allreduce_microstep: 8.78 | backward_allreduce: 3.05 | reduce_tied_grads: 0.38 | comms: 18.25 | reduce_grads: 0.24 | step: 372.19 | _step_clipping: 0.13 | _step_step: 370.22 | _step_zero_grad: 0.60 | _step_check_overflow: 0.55 samples/sec: 16.082 | iteration 26470/ 143000 | elapsed time per iteration (ms): 63675.5 | learning rate: 5.558E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.314812E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 07:46:03,890] [INFO] [logging.py:60:log_dist] [Rank 0] step=26480, skipped=31, lr=[0.0005558127090404111, 0.0005558127090404111], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26480 loss: 2.2928 iter time (s): 63.512 samples/sec: 16.123 %comms: 0.0028514783395415634 %optimizer_step 0.056108654907924035 %forward: 22.8927942485799 %backward: 61.43804874199126 [2025-04-13 07:46:03,890] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25631.25 | forward: 145396.44 | backward_microstep: 390215.28 | backward: 390204.60 | backward_inner_microstep: 390186.40 | backward_inner: 390179.81 | backward_allreduce_microstep: 8.93 | backward_allreduce: 3.05 | reduce_tied_grads: 0.33 | comms: 18.11 | reduce_grads: 0.23 | step: 356.36 | _step_clipping: 0.13 | _step_step: 354.58 | _step_zero_grad: 0.53 | _step_check_overflow: 0.51 samples/sec: 16.123 | iteration 26480/ 143000 | elapsed time per iteration (ms): 63512.5 | learning rate: 5.558E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.314477E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 07:56:42,294] [INFO] [logging.py:60:log_dist] [Rank 0] step=26490, skipped=31, lr=[0.0005557782736809489, 0.0005557782736809489], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26490 loss: 2.3223 iter time (s): 63.840 samples/sec: 16.040 %comms: 0.0028285004079591884 %optimizer_step 0.056696649627467295 %forward: 22.79861374596344 %backward: 61.13287643446886 [2025-04-13 07:56:42,295] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28713.41 | forward: 145546.04 | backward_microstep: 390283.36 | backward: 390271.45 | backward_inner_microstep: 390252.55 | backward_inner: 390245.41 | backward_allreduce_microstep: 8.88 | backward_allreduce: 3.05 | reduce_tied_grads: 0.32 | comms: 18.06 | reduce_grads: 0.22 | step: 361.95 | _step_clipping: 0.14 | _step_step: 360.15 | _step_zero_grad: 0.54 | _step_check_overflow: 0.51 samples/sec: 16.040 | iteration 26490/ 143000 | elapsed time per iteration (ms): 63840.4 | learning rate: 5.558E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.336914E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 08:07:16,500] [INFO] [logging.py:60:log_dist] [Rank 0] step=26500, skipped=31, lr=[0.0005557438259764805, 0.0005557438259764805], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26500 loss: 2.3063 iter time (s): 63.420 samples/sec: 16.146 %comms: 0.002848168614216523 %optimizer_step 0.05663514229870806 %forward: 22.995953392693906 %backward: 61.543665682531966 [2025-04-13 08:07:16,500] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24171.08 | forward: 145840.20 | backward_microstep: 390320.02 | backward: 390309.56 | backward_inner_microstep: 390290.76 | backward_inner: 390283.84 | backward_allreduce_microstep: 9.10 | backward_allreduce: 3.16 | reduce_tied_grads: 0.36 | comms: 18.06 | reduce_grads: 0.23 | step: 359.18 | _step_clipping: 0.15 | _step_step: 357.36 | _step_zero_grad: 0.50 | _step_check_overflow: 0.55 samples/sec: 16.146 | iteration 26500/ 143000 | elapsed time per iteration (ms): 63420.6 | learning rate: 5.557E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.313674E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 08:17:49,409] [INFO] [logging.py:60:log_dist] [Rank 0] step=26510, skipped=31, lr=[0.000555709365928668, 0.000555709365928668], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26510 loss: 2.3104 iter time (s): 63.290 samples/sec: 16.179 %comms: 0.002840893659859062 %optimizer_step 0.05550215074700094 %forward: 22.974117129583 %backward: 61.65181442471188 [2025-04-13 08:17:49,409] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23438.35 | forward: 145403.86 | backward_microstep: 390205.17 | backward: 390196.13 | backward_inner_microstep: 390179.36 | backward_inner: 390173.02 | backward_allreduce_microstep: 8.13 | backward_allreduce: 2.80 | reduce_tied_grads: 0.29 | comms: 17.98 | reduce_grads: 0.19 | step: 351.27 | _step_clipping: 0.12 | _step_step: 349.52 | _step_zero_grad: 0.50 | _step_check_overflow: 0.56 samples/sec: 16.179 | iteration 26510/ 143000 | elapsed time per iteration (ms): 63290.9 | learning rate: 5.557E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.314828E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 08:28:19,194] [INFO] [logging.py:60:log_dist] [Rank 0] step=26520, skipped=31, lr=[0.0005556748935391751, 0.0005556748935391751], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26520 loss: 2.3204 iter time (s): 62.978 samples/sec: 16.260 %comms: 0.0029023406769000844 %optimizer_step 0.057510373605937845 %forward: 23.11132256931339 %backward: 61.97910271933724 [2025-04-13 08:28:19,195] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19976.00 | forward: 145550.48 | backward_microstep: 390343.89 | backward: 390331.98 | backward_inner_microstep: 390313.64 | backward_inner: 390306.84 | backward_allreduce_microstep: 8.90 | backward_allreduce: 3.19 | reduce_tied_grads: 0.31 | comms: 18.28 | reduce_grads: 0.20 | step: 362.19 | _step_clipping: 0.12 | _step_step: 360.33 | _step_zero_grad: 0.54 | _step_check_overflow: 0.56 samples/sec: 16.259 | iteration 26520/ 143000 | elapsed time per iteration (ms): 62978.6 | learning rate: 5.557E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.315045E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 08:38:56,347] [INFO] [logging.py:60:log_dist] [Rank 0] step=26530, skipped=31, lr=[0.0005556404088096652, 0.0005556404088096652], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26530 loss: 2.3042 iter time (s): 63.715 samples/sec: 16.072 %comms: 0.0028321472876805774 %optimizer_step 0.05750416011063885 %forward: 22.83552249208222 %backward: 61.251360431487925 [2025-04-13 08:38:56,348] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27501.26 | forward: 145495.90 | backward_microstep: 390271.48 | backward: 390261.36 | backward_inner_microstep: 390243.49 | backward_inner: 390236.78 | backward_allreduce_microstep: 8.66 | backward_allreduce: 2.99 | reduce_tied_grads: 0.35 | comms: 18.04 | reduce_grads: 0.24 | step: 366.39 | _step_clipping: 0.14 | _step_step: 364.52 | _step_zero_grad: 0.55 | _step_check_overflow: 0.56 samples/sec: 16.071 | iteration 26530/ 143000 | elapsed time per iteration (ms): 63715.3 | learning rate: 5.556E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.319592E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 08:49:36,662] [INFO] [logging.py:60:log_dist] [Rank 0] step=26540, skipped=31, lr=[0.000555605911741803, 0.000555605911741803], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26540 loss: 2.3116 iter time (s): 64.031 samples/sec: 15.992 %comms: 0.0028151474027321415 %optimizer_step 0.05571169449194217 %forward: 22.705567076100877 %backward: 60.93015711859875 [2025-04-13 08:49:36,662] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30924.94 | forward: 145385.75 | backward_microstep: 390149.81 | backward: 390141.16 | backward_inner_microstep: 390123.29 | backward_inner: 390116.72 | backward_allreduce_microstep: 8.72 | backward_allreduce: 3.01 | reduce_tied_grads: 0.33 | comms: 18.03 | reduce_grads: 0.22 | step: 356.73 | _step_clipping: 0.13 | _step_step: 354.93 | _step_zero_grad: 0.53 | _step_check_overflow: 0.53 samples/sec: 15.992 | iteration 26540/ 143000 | elapsed time per iteration (ms): 64031.5 | learning rate: 5.556E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.316945E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 09:00:19,177] [INFO] [logging.py:60:log_dist] [Rank 0] step=26550, skipped=31, lr=[0.0005555714023372533, 0.0005555714023372533], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26550 loss: 2.2895 iter time (s): 64.251 samples/sec: 15.938 %comms: 0.0028528175995328696 %optimizer_step 0.056625906046919354 %forward: 22.650904261153123 %backward: 60.73551874729253 [2025-04-13 09:00:19,178] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32840.90 | forward: 145534.18 | backward_microstep: 390240.97 | backward: 390231.40 | backward_inner_microstep: 390213.34 | backward_inner: 390206.77 | backward_allreduce_microstep: 8.79 | backward_allreduce: 3.03 | reduce_tied_grads: 0.33 | comms: 18.33 | reduce_grads: 0.24 | step: 363.83 | _step_clipping: 0.13 | _step_step: 361.85 | _step_zero_grad: 0.58 | _step_check_overflow: 0.64 samples/sec: 15.937 | iteration 26550/ 143000 | elapsed time per iteration (ms): 64251.5 | learning rate: 5.556E-04 | approx flops per GPU: 68.8TFLOPS | lm_loss: 2.305211E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 09:10:52,751] [INFO] [logging.py:60:log_dist] [Rank 0] step=26560, skipped=31, lr=[0.0005555368805976816, 0.0005555368805976816], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26560 loss: 2.3294 iter time (s): 63.357 samples/sec: 16.162 %comms: 0.002869897949583904 %optimizer_step 0.057461706069284746 %forward: 22.977164750656804 %backward: 61.59767426576989 [2025-04-13 09:10:52,752] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23840.49 | forward: 145575.96 | backward_microstep: 390273.82 | backward: 390263.14 | backward_inner_microstep: 390244.61 | backward_inner: 390237.61 | backward_allreduce_microstep: 8.92 | backward_allreduce: 3.09 | reduce_tied_grads: 0.35 | comms: 18.18 | reduce_grads: 0.23 | step: 364.06 | _step_clipping: 0.13 | _step_step: 362.32 | _step_zero_grad: 0.53 | _step_check_overflow: 0.47 samples/sec: 16.162 | iteration 26560/ 143000 | elapsed time per iteration (ms): 63357.4 | learning rate: 5.555E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.328897E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 09:21:26,488] [INFO] [logging.py:60:log_dist] [Rank 0] step=26570, skipped=31, lr=[0.0005555023465247542, 0.0005555023465247542], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26570 loss: 2.3182 iter time (s): 63.373 samples/sec: 16.158 %comms: 0.0028407909458590874 %optimizer_step 0.05682669149807638 %forward: 22.977864462520184 %backward: 61.564390444491046 [2025-04-13 09:21:26,489] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24113.58 | forward: 145617.96 | backward_microstep: 390161.36 | backward: 390152.93 | backward_inner_microstep: 390135.05 | backward_inner: 390126.93 | backward_allreduce_microstep: 8.74 | backward_allreduce: 2.95 | reduce_tied_grads: 0.33 | comms: 18.00 | reduce_grads: 0.23 | step: 360.13 | _step_clipping: 0.14 | _step_step: 358.16 | _step_zero_grad: 0.56 | _step_check_overflow: 0.65 samples/sec: 16.158 | iteration 26570/ 143000 | elapsed time per iteration (ms): 63373.7 | learning rate: 5.555E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.328250E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 09:32:04,937] [INFO] [logging.py:60:log_dist] [Rank 0] step=26580, skipped=31, lr=[0.000555467800120138, 0.000555467800120138], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26580 loss: 2.3225 iter time (s): 63.844 samples/sec: 16.039 %comms: 0.0028123584224609933 %optimizer_step 0.05478747563263302 %forward: 22.80560038230777 %backward: 61.11469261194865 [2025-04-13 09:32:04,938] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28787.61 | forward: 145600.74 | backward_microstep: 390191.59 | backward: 390182.43 | backward_inner_microstep: 390165.12 | backward_inner: 390158.61 | backward_allreduce_microstep: 8.40 | backward_allreduce: 2.98 | reduce_tied_grads: 0.31 | comms: 17.96 | reduce_grads: 0.20 | step: 349.79 | _step_clipping: 0.12 | _step_step: 348.07 | _step_zero_grad: 0.49 | _step_check_overflow: 0.53 samples/sec: 16.039 | iteration 26580/ 143000 | elapsed time per iteration (ms): 63844.9 | learning rate: 5.555E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.315501E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 09:42:41,066] [INFO] [logging.py:60:log_dist] [Rank 0] step=26590, skipped=31, lr=[0.0005554332413855001, 0.0005554332413855001], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26590 loss: 2.3422 iter time (s): 63.612 samples/sec: 16.098 %comms: 0.0028446510660199813 %optimizer_step 0.056101370422925734 %forward: 22.85395683574117 %backward: 61.351563531567486 [2025-04-13 09:42:41,067] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26578.76 | forward: 145379.39 | backward_microstep: 390284.24 | backward: 390271.70 | backward_inner_microstep: 390253.96 | backward_inner: 390247.25 | backward_allreduce_microstep: 8.57 | backward_allreduce: 2.94 | reduce_tied_grads: 0.32 | comms: 18.10 | reduce_grads: 0.22 | step: 356.87 | _step_clipping: 0.11 | _step_step: 355.16 | _step_zero_grad: 0.51 | _step_check_overflow: 0.47 samples/sec: 16.097 | iteration 26590/ 143000 | elapsed time per iteration (ms): 63612.9 | learning rate: 5.554E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.332548E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 09:53:11,672] [INFO] [logging.py:60:log_dist] [Rank 0] step=26600, skipped=31, lr=[0.0005553986703225085, 0.0005553986703225085], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26600 loss: 2.3317 iter time (s): 63.060 samples/sec: 16.238 %comms: 0.0028711537612671942 %optimizer_step 0.05677046810033423 %forward: 23.058510722260202 %backward: 61.87258149294892 [2025-04-13 09:53:11,673] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21153.64 | forward: 145407.06 | backward_microstep: 390177.31 | backward: 390168.74 | backward_inner_microstep: 390150.82 | backward_inner: 390144.22 | backward_allreduce_microstep: 8.76 | backward_allreduce: 3.03 | reduce_tied_grads: 0.32 | comms: 18.11 | reduce_grads: 0.23 | step: 357.99 | _step_clipping: 0.13 | _step_step: 356.19 | _step_zero_grad: 0.51 | _step_check_overflow: 0.57 samples/sec: 16.238 | iteration 26600/ 143000 | elapsed time per iteration (ms): 63060.6 | learning rate: 5.554E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.320424E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 10:03:52,168] [INFO] [logging.py:60:log_dist] [Rank 0] step=26610, skipped=31, lr=[0.000555364086932832, 0.000555364086932832], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26610 loss: 2.3081 iter time (s): 64.049 samples/sec: 15.988 %comms: 0.0028500462956824045 %optimizer_step 0.05637336034106946 %forward: 22.728925566919507 %backward: 60.912786096020035 [2025-04-13 10:03:52,169] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30880.37 | forward: 145576.64 | backward_microstep: 390149.82 | backward: 390140.70 | backward_inner_microstep: 390123.66 | backward_inner: 390117.33 | backward_allreduce_microstep: 8.26 | backward_allreduce: 2.83 | reduce_tied_grads: 0.34 | comms: 18.25 | reduce_grads: 0.22 | step: 361.07 | _step_clipping: 0.13 | _step_step: 359.10 | _step_zero_grad: 0.50 | _step_check_overflow: 0.72 samples/sec: 15.988 | iteration 26610/ 143000 | elapsed time per iteration (ms): 64049.6 | learning rate: 5.554E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.316300E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 10:14:24,885] [INFO] [logging.py:60:log_dist] [Rank 0] step=26620, skipped=31, lr=[0.0005553294912181393, 0.0005553294912181393], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26620 loss: 2.3054 iter time (s): 63.271 samples/sec: 16.184 %comms: 0.0028869755993880452 %optimizer_step 0.05675001800148777 %forward: 22.971628707206914 %backward: 61.674068062089304 [2025-04-13 10:14:24,886] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23241.80 | forward: 145343.93 | backward_microstep: 390229.79 | backward: 390218.37 | backward_inner_microstep: 390199.96 | backward_inner: 390193.18 | backward_allreduce_microstep: 8.79 | backward_allreduce: 3.03 | reduce_tied_grads: 0.53 | comms: 18.27 | reduce_grads: 0.20 | step: 359.06 | _step_clipping: 0.12 | _step_step: 357.31 | _step_zero_grad: 0.49 | _step_check_overflow: 0.56 samples/sec: 16.184 | iteration 26620/ 143000 | elapsed time per iteration (ms): 63271.6 | learning rate: 5.553E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.321395E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 10:25:01,591] [INFO] [logging.py:60:log_dist] [Rank 0] step=26630, skipped=31, lr=[0.0005552948831801007, 0.0005552948831801007], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26630 loss: 2.3344 iter time (s): 63.670 samples/sec: 16.083 %comms: 0.0030896286036125814 %optimizer_step 0.05687033825150153 %forward: 22.842357982621923 %backward: 61.286929930888476 [2025-04-13 10:25:01,592] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27164.35 | forward: 145437.39 | backward_microstep: 390224.46 | backward: 390214.15 | backward_inner_microstep: 390196.09 | backward_inner: 390189.56 | backward_allreduce_microstep: 8.62 | backward_allreduce: 3.11 | reduce_tied_grads: 0.32 | comms: 19.67 | reduce_grads: 0.23 | step: 362.09 | _step_clipping: 0.12 | _step_step: 356.85 | _step_zero_grad: 2.22 | _step_check_overflow: 0.61 samples/sec: 16.083 | iteration 26630/ 143000 | elapsed time per iteration (ms): 63670.6 | learning rate: 5.553E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.324494E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 10:35:31,300] [INFO] [logging.py:60:log_dist] [Rank 0] step=26640, skipped=31, lr=[0.0005552602628203861, 0.0005552602628203861], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26640 loss: 2.3235 iter time (s): 62.970 samples/sec: 16.262 %comms: 0.002846810656049403 %optimizer_step 0.056045470226492616 %forward: 23.069789007985896 %backward: 61.96049233738273 [2025-04-13 10:35:31,300] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20424.62 | forward: 145271.17 | backward_microstep: 390176.52 | backward: 390167.13 | backward_inner_microstep: 390150.13 | backward_inner: 390143.88 | backward_allreduce_microstep: 8.09 | backward_allreduce: 2.77 | reduce_tied_grads: 0.30 | comms: 17.93 | reduce_grads: 0.20 | step: 352.92 | _step_clipping: 0.12 | _step_step: 351.20 | _step_zero_grad: 0.46 | _step_check_overflow: 0.56 samples/sec: 16.261 | iteration 26640/ 143000 | elapsed time per iteration (ms): 62970.9 | learning rate: 5.553E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.320102E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 10:46:21,695] [INFO] [logging.py:60:log_dist] [Rank 0] step=26650, skipped=31, lr=[0.0005552256301406667, 0.0005552256301406667], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26650 loss: 2.3165 iter time (s): 65.039 samples/sec: 15.744 %comms: 0.002839922605494086 %optimizer_step 0.05966748107193196 %forward: 22.37364883238661 %backward: 60.01597620784537 [2025-04-13 10:46:21,695] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 40605.86 | forward: 145515.61 | backward_microstep: 390348.91 | backward: 390336.92 | backward_inner_microstep: 390317.61 | backward_inner: 390310.57 | backward_allreduce_microstep: 9.49 | backward_allreduce: 3.30 | reduce_tied_grads: 0.38 | comms: 18.47 | reduce_grads: 0.28 | step: 388.07 | _step_clipping: 0.15 | _step_step: 385.90 | _step_zero_grad: 0.65 | _step_check_overflow: 0.65 samples/sec: 15.744 | iteration 26650/ 143000 | elapsed time per iteration (ms): 65039.5 | learning rate: 5.552E-04 | approx flops per GPU: 67.9TFLOPS | lm_loss: 2.316339E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 10:57:07,030] [INFO] [logging.py:60:log_dist] [Rank 0] step=26660, skipped=31, lr=[0.0005551909851426139, 0.0005551909851426139], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26660 loss: 2.3109 iter time (s): 64.533 samples/sec: 15.868 %comms: 0.0028247246123302566 %optimizer_step 0.05600803385861512 %forward: 22.53218723210037 %backward: 60.46021485388078 [2025-04-13 10:57:07,031] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 35871.92 | forward: 145406.76 | backward_microstep: 390177.91 | backward: 390167.35 | backward_inner_microstep: 390148.20 | backward_inner: 390141.32 | backward_allreduce_microstep: 9.43 | backward_allreduce: 3.23 | reduce_tied_grads: 0.34 | comms: 18.23 | reduce_grads: 0.22 | step: 361.44 | _step_clipping: 0.15 | _step_step: 359.48 | _step_zero_grad: 0.57 | _step_check_overflow: 0.62 samples/sec: 15.868 | iteration 26660/ 143000 | elapsed time per iteration (ms): 64533.5 | learning rate: 5.552E-04 | approx flops per GPU: 68.5TFLOPS | lm_loss: 2.312826E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 11:07:39,951] [INFO] [logging.py:60:log_dist] [Rank 0] step=26670, skipped=31, lr=[0.0005551563278278997, 0.0005551563278278997], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26670 loss: 2.2778 iter time (s): 63.291 samples/sec: 16.179 %comms: 0.0028459655243714323 %optimizer_step 0.057812163893896575 %forward: 22.963103890750208 %backward: 61.64799555508327 [2025-04-13 11:07:39,952] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23492.55 | forward: 145336.78 | backward_microstep: 390188.62 | backward: 390179.00 | backward_inner_microstep: 390157.68 | backward_inner: 390149.40 | backward_allreduce_microstep: 8.81 | backward_allreduce: 2.96 | reduce_tied_grads: 0.34 | comms: 18.01 | reduce_grads: 0.21 | step: 365.90 | _step_clipping: 0.12 | _step_step: 364.05 | _step_zero_grad: 0.55 | _step_check_overflow: 0.57 samples/sec: 16.179 | iteration 26670/ 143000 | elapsed time per iteration (ms): 63292.1 | learning rate: 5.552E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.309603E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 11:18:18,806] [INFO] [logging.py:60:log_dist] [Rank 0] step=26680, skipped=31, lr=[0.0005551216581981971, 0.0005551216581981971], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26680 loss: 2.3021 iter time (s): 63.885 samples/sec: 16.029 %comms: 0.0028174735019947947 %optimizer_step 0.05548700480867396 %forward: 22.767691747608655 %backward: 61.08088473389689 [2025-04-13 11:18:18,807] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29275.27 | forward: 145451.25 | backward_microstep: 390225.07 | backward: 390214.82 | backward_inner_microstep: 390195.69 | backward_inner: 390189.19 | backward_allreduce_microstep: 10.17 | backward_allreduce: 2.85 | reduce_tied_grads: 0.31 | comms: 18.00 | reduce_grads: 0.19 | step: 354.48 | _step_clipping: 0.13 | _step_step: 352.61 | _step_zero_grad: 0.52 | _step_check_overflow: 0.63 samples/sec: 16.029 | iteration 26680/ 143000 | elapsed time per iteration (ms): 63885.5 | learning rate: 5.551E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.306746E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 11:28:50,127] [INFO] [logging.py:60:log_dist] [Rank 0] step=26690, skipped=31, lr=[0.0005550869762551793, 0.0005550869762551793], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26690 loss: 2.3098 iter time (s): 63.131 samples/sec: 16.220 %comms: 0.002859973510678358 %optimizer_step 0.057184968301140116 %forward: 23.01596285585592 %backward: 61.81156304647344 [2025-04-13 11:28:50,128] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21882.86 | forward: 145303.20 | backward_microstep: 390236.09 | backward: 390225.61 | backward_inner_microstep: 390206.31 | backward_inner: 390199.31 | backward_allreduce_microstep: 9.49 | backward_allreduce: 3.25 | reduce_tied_grads: 0.32 | comms: 18.06 | reduce_grads: 0.21 | step: 361.02 | _step_clipping: 0.12 | _step_step: 359.18 | _step_zero_grad: 0.52 | _step_check_overflow: 0.59 samples/sec: 16.220 | iteration 26690/ 143000 | elapsed time per iteration (ms): 63132.1 | learning rate: 5.551E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.311942E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 11:39:27,611] [INFO] [logging.py:60:log_dist] [Rank 0] step=26700, skipped=31, lr=[0.0005550522820005202, 0.0005550522820005202], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26700 loss: 2.3259 iter time (s): 63.748 samples/sec: 16.063 %comms: 0.0028478072247899878 %optimizer_step 0.05592509346652614 %forward: 22.82624189939395 %backward: 61.22839662729505 [2025-04-13 11:39:27,612] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27729.85 | forward: 145512.27 | backward_microstep: 390329.01 | backward: 390317.55 | backward_inner_microstep: 390299.75 | backward_inner: 390293.13 | backward_allreduce_microstep: 8.54 | backward_allreduce: 2.93 | reduce_tied_grads: 0.35 | comms: 18.15 | reduce_grads: 0.24 | step: 356.51 | _step_clipping: 0.14 | _step_step: 354.61 | _step_zero_grad: 0.52 | _step_check_overflow: 0.62 samples/sec: 16.063 | iteration 26700/ 143000 | elapsed time per iteration (ms): 63748.4 | learning rate: 5.551E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.316346E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 11:50:05,502] [INFO] [logging.py:60:log_dist] [Rank 0] step=26710, skipped=31, lr=[0.0005550175754358941, 0.0005550175754358941], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26710 loss: 2.2957 iter time (s): 63.788 samples/sec: 16.053 %comms: 0.002848496878311523 %optimizer_step 0.05801498319898151 %forward: 22.8406281705908 %backward: 61.198110566167315 [2025-04-13 11:50:05,502] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27885.34 | forward: 145696.81 | backward_microstep: 390385.14 | backward: 390373.22 | backward_inner_microstep: 390355.02 | backward_inner: 390348.05 | backward_allreduce_microstep: 8.72 | backward_allreduce: 3.01 | reduce_tied_grads: 0.37 | comms: 18.17 | reduce_grads: 0.23 | step: 370.07 | _step_clipping: 0.13 | _step_step: 368.25 | _step_zero_grad: 0.56 | _step_check_overflow: 0.49 samples/sec: 16.053 | iteration 26710/ 143000 | elapsed time per iteration (ms): 63789.1 | learning rate: 5.550E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.318068E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 12:00:46,329] [INFO] [logging.py:60:log_dist] [Rank 0] step=26720, skipped=31, lr=[0.0005549828565629765, 0.0005549828565629765], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26720 loss: 2.3229 iter time (s): 64.082 samples/sec: 15.979 %comms: 0.0028072032772100347 %optimizer_step 0.055854609446327946 %forward: 22.705275144475006 %backward: 60.91006771857034 [2025-04-13 12:00:46,330] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31094.10 | forward: 145500.26 | backward_microstep: 390336.01 | backward: 390324.73 | backward_inner_microstep: 390305.68 | backward_inner: 390298.77 | backward_allreduce_microstep: 9.33 | backward_allreduce: 3.20 | reduce_tied_grads: 0.31 | comms: 17.99 | reduce_grads: 0.24 | step: 357.93 | _step_clipping: 0.11 | _step_step: 356.10 | _step_zero_grad: 0.54 | _step_check_overflow: 0.57 samples/sec: 15.979 | iteration 26720/ 143000 | elapsed time per iteration (ms): 64082.8 | learning rate: 5.550E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.316122E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 12:11:29,838] [INFO] [logging.py:60:log_dist] [Rank 0] step=26730, skipped=31, lr=[0.0005549481253834428, 0.0005549481253834428], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26730 loss: 2.2959 iter time (s): 64.350 samples/sec: 15.913 %comms: 0.00280532940086348 %optimizer_step 0.0567011695012832 %forward: 22.675015308981944 %backward: 60.62318391374052 [2025-04-13 12:11:29,838] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 33586.67 | forward: 145914.09 | backward_microstep: 390121.88 | backward: 390111.16 | backward_inner_microstep: 390093.17 | backward_inner: 390086.51 | backward_allreduce_microstep: 8.72 | backward_allreduce: 2.98 | reduce_tied_grads: 0.34 | comms: 18.05 | reduce_grads: 0.24 | step: 364.87 | _step_clipping: 0.12 | _step_step: 363.08 | _step_zero_grad: 0.57 | _step_check_overflow: 0.50 samples/sec: 15.913 | iteration 26730/ 143000 | elapsed time per iteration (ms): 64350.8 | learning rate: 5.549E-04 | approx flops per GPU: 68.6TFLOPS | lm_loss: 2.314514E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 12:22:03,462] [INFO] [logging.py:60:log_dist] [Rank 0] step=26740, skipped=31, lr=[0.0005549133818989692, 0.0005549133818989692], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26740 loss: 2.3334 iter time (s): 63.362 samples/sec: 16.161 %comms: 0.002897585501935193 %optimizer_step 0.05848399641262765 %forward: 22.981370763715162 %backward: 61.61458833690349 [2025-04-13 12:22:03,463] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23667.77 | forward: 145614.40 | backward_microstep: 390415.19 | backward: 390401.93 | backward_inner_microstep: 390382.05 | backward_inner: 390375.27 | backward_allreduce_microstep: 10.30 | backward_allreduce: 2.98 | reduce_tied_grads: 0.34 | comms: 18.36 | reduce_grads: 0.23 | step: 370.57 | _step_clipping: 0.12 | _step_step: 368.72 | _step_zero_grad: 0.49 | _step_check_overflow: 0.63 samples/sec: 16.161 | iteration 26740/ 143000 | elapsed time per iteration (ms): 63362.5 | learning rate: 5.549E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.324437E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 12:32:31,889] [INFO] [logging.py:60:log_dist] [Rank 0] step=26750, skipped=31, lr=[0.0005548786261112328, 0.0005548786261112328], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26750 loss: 2.2910 iter time (s): 62.842 samples/sec: 16.295 %comms: 0.0029013339809930167 %optimizer_step 0.05686311846934872 %forward: 23.142531606649154 %backward: 62.126288057384805 [2025-04-13 12:32:31,889] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 18665.13 | forward: 145432.46 | backward_microstep: 390427.15 | backward: 390414.47 | backward_inner_microstep: 390396.92 | backward_inner: 390390.33 | backward_allreduce_microstep: 8.35 | backward_allreduce: 2.87 | reduce_tied_grads: 0.32 | comms: 18.23 | reduce_grads: 0.22 | step: 357.34 | _step_clipping: 0.12 | _step_step: 355.45 | _step_zero_grad: 0.51 | _step_check_overflow: 0.65 samples/sec: 16.295 | iteration 26750/ 143000 | elapsed time per iteration (ms): 62842.6 | learning rate: 5.549E-04 | approx flops per GPU: 70.3TFLOPS | lm_loss: 2.307324E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 12:43:18,104] [INFO] [logging.py:60:log_dist] [Rank 0] step=26760, skipped=31, lr=[0.000554843858021911, 0.000554843858021911], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26760 loss: 2.2947 iter time (s): 64.621 samples/sec: 15.846 %comms: 0.0027925049555128905 %optimizer_step 0.05621812623386051 %forward: 22.550791455611492 %backward: 60.39785927310961 [2025-04-13 12:43:18,105] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 36265.89 | forward: 145725.30 | backward_microstep: 390307.50 | backward: 390296.56 | backward_inner_microstep: 390278.11 | backward_inner: 390271.38 | backward_allreduce_microstep: 8.86 | backward_allreduce: 3.06 | reduce_tied_grads: 0.33 | comms: 18.05 | reduce_grads: 0.22 | step: 363.29 | _step_clipping: 0.12 | _step_step: 361.38 | _step_zero_grad: 0.59 | _step_check_overflow: 0.57 samples/sec: 15.846 | iteration 26760/ 143000 | elapsed time per iteration (ms): 64621.5 | learning rate: 5.548E-04 | approx flops per GPU: 68.4TFLOPS | lm_loss: 2.308722E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 12:53:56,092] [INFO] [logging.py:60:log_dist] [Rank 0] step=26770, skipped=31, lr=[0.0005548090776326819, 0.0005548090776326819], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26770 loss: 2.3141 iter time (s): 63.798 samples/sec: 16.051 %comms: 0.0028669330792271044 %optimizer_step 0.056558883889782814 %forward: 22.830673214618848 %backward: 61.193222876640704 [2025-04-13 12:53:56,092] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27977.52 | forward: 145655.61 | backward_microstep: 390413.83 | backward: 390401.81 | backward_inner_microstep: 390383.45 | backward_inner: 390376.66 | backward_allreduce_microstep: 8.92 | backward_allreduce: 3.20 | reduce_tied_grads: 0.30 | comms: 18.29 | reduce_grads: 0.21 | step: 360.84 | _step_clipping: 0.13 | _step_step: 358.85 | _step_zero_grad: 0.49 | _step_check_overflow: 0.58 samples/sec: 16.050 | iteration 26770/ 143000 | elapsed time per iteration (ms): 63798.8 | learning rate: 5.548E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.305093E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 13:04:25,937] [INFO] [logging.py:60:log_dist] [Rank 0] step=26780, skipped=31, lr=[0.0005547742849452242, 0.0005547742849452242], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26780 loss: 2.3325 iter time (s): 62.984 samples/sec: 16.258 %comms: 0.002859893949480021 %optimizer_step 0.05544854040920677 %forward: 23.096381495285197 %backward: 61.97217196566128 [2025-04-13 13:04:25,938] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20144.03 | forward: 145470.30 | backward_microstep: 390336.30 | backward: 390325.66 | backward_inner_microstep: 390306.83 | backward_inner: 390300.42 | backward_allreduce_microstep: 8.21 | backward_allreduce: 2.81 | reduce_tied_grads: 0.30 | comms: 18.01 | reduce_grads: 0.21 | step: 349.24 | _step_clipping: 0.11 | _step_step: 347.52 | _step_zero_grad: 0.48 | _step_check_overflow: 0.54 samples/sec: 16.258 | iteration 26780/ 143000 | elapsed time per iteration (ms): 62984.6 | learning rate: 5.548E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.318389E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 13:15:03,090] [INFO] [logging.py:60:log_dist] [Rank 0] step=26790, skipped=31, lr=[0.0005547394799612169, 0.0005547394799612169], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26790 loss: 2.3055 iter time (s): 63.715 samples/sec: 16.072 %comms: 0.0028429279990439585 %optimizer_step 0.054952809591755136 %forward: 22.85089283749014 %backward: 61.261592710226395 [2025-04-13 13:15:03,090] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27322.03 | forward: 145593.64 | backward_microstep: 390338.85 | backward: 390326.03 | backward_inner_microstep: 390308.23 | backward_inner: 390301.62 | backward_allreduce_microstep: 8.60 | backward_allreduce: 2.89 | reduce_tied_grads: 0.31 | comms: 18.11 | reduce_grads: 0.20 | step: 350.13 | _step_clipping: 0.12 | _step_step: 348.37 | _step_zero_grad: 0.51 | _step_check_overflow: 0.53 samples/sec: 16.072 | iteration 26790/ 143000 | elapsed time per iteration (ms): 63715.2 | learning rate: 5.547E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.313343E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 13:25:41,784] [INFO] [logging.py:60:log_dist] [Rank 0] step=26800, skipped=31, lr=[0.0005547046626823399, 0.0005547046626823399], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26800 loss: 2.3088 iter time (s): 63.869 samples/sec: 16.033 %comms: 0.0031062893189868286 %optimizer_step 0.059711322375890205 %forward: 22.809503112464906 %backward: 61.10871980998473 [2025-04-13 13:25:41,785] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28767.71 | forward: 145681.77 | backward_microstep: 390306.21 | backward: 390294.61 | backward_inner_microstep: 390277.63 | backward_inner: 390271.20 | backward_allreduce_microstep: 8.06 | backward_allreduce: 2.77 | reduce_tied_grads: 0.32 | comms: 19.84 | reduce_grads: 0.22 | step: 381.37 | _step_clipping: 0.12 | _step_step: 379.53 | _step_zero_grad: 0.56 | _step_check_overflow: 0.54 samples/sec: 16.033 | iteration 26800/ 143000 | elapsed time per iteration (ms): 63869.5 | learning rate: 5.547E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.312860E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 13:36:13,783] [INFO] [logging.py:60:log_dist] [Rank 0] step=26810, skipped=31, lr=[0.0005546698331102741, 0.0005546698331102741], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26810 loss: 2.3060 iter time (s): 63.199 samples/sec: 16.203 %comms: 0.0028965148516503464 %optimizer_step 0.056749357974449195 %forward: 23.003288892265918 %backward: 61.73574929848951 [2025-04-13 13:36:13,784] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22533.81 | forward: 145379.23 | backward_microstep: 390175.99 | backward: 390165.77 | backward_inner_microstep: 390148.36 | backward_inner: 390141.73 | backward_allreduce_microstep: 8.37 | backward_allreduce: 2.88 | reduce_tied_grads: 0.34 | comms: 18.31 | reduce_grads: 0.23 | step: 358.65 | _step_clipping: 0.13 | _step_step: 356.62 | _step_zero_grad: 0.52 | _step_check_overflow: 0.77 samples/sec: 16.203 | iteration 26810/ 143000 | elapsed time per iteration (ms): 63199.9 | learning rate: 5.547E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.313666E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 13:46:53,048] [INFO] [logging.py:60:log_dist] [Rank 0] step=26820, skipped=31, lr=[0.0005546349912466999, 0.0005546349912466999], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26820 loss: 2.3107 iter time (s): 63.926 samples/sec: 16.019 %comms: 0.0028850056421041706 %optimizer_step 0.05741371578018981 %forward: 22.752936913477868 %backward: 61.0342571053968 [2025-04-13 13:46:53,051] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29732.82 | forward: 145449.98 | backward_microstep: 390175.75 | backward: 390166.40 | backward_inner_microstep: 390148.17 | backward_inner: 390141.42 | backward_allreduce_microstep: 8.85 | backward_allreduce: 3.05 | reduce_tied_grads: 0.35 | comms: 18.44 | reduce_grads: 0.23 | step: 367.02 | _step_clipping: 0.14 | _step_step: 365.13 | _step_zero_grad: 0.56 | _step_check_overflow: 0.54 samples/sec: 16.018 | iteration 26820/ 143000 | elapsed time per iteration (ms): 63926.8 | learning rate: 5.546E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.319638E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 13:57:38,508] [INFO] [logging.py:60:log_dist] [Rank 0] step=26830, skipped=31, lr=[0.0005546001370932993, 0.0005546001370932993], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26830 loss: 2.3029 iter time (s): 64.545 samples/sec: 15.865 %comms: 0.0028195008839748447 %optimizer_step 0.05610252684522496 %forward: 22.56869364375086 %backward: 60.443352835750396 [2025-04-13 13:57:38,509] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 35751.84 | forward: 145669.81 | backward_microstep: 390140.46 | backward: 390132.09 | backward_inner_microstep: 390113.03 | backward_inner: 390106.33 | backward_allreduce_microstep: 9.49 | backward_allreduce: 3.32 | reduce_tied_grads: 0.31 | comms: 18.20 | reduce_grads: 0.22 | step: 362.11 | _step_clipping: 0.13 | _step_step: 360.18 | _step_zero_grad: 0.60 | _step_check_overflow: 0.57 samples/sec: 15.865 | iteration 26830/ 143000 | elapsed time per iteration (ms): 64545.7 | learning rate: 5.546E-04 | approx flops per GPU: 68.4TFLOPS | lm_loss: 2.311162E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 14:02:51,117] [INFO] [stage1.py:697:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 524288.0, reducing to 524288.0 [2025-04-13 14:03:53,476] [INFO] [stage1.py:697:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 524288.0, reducing to 262144.0 [2025-04-13 14:08:02,862] [INFO] [logging.py:60:log_dist] [Rank 0] step=26840, skipped=33, lr=[0.0005545722449230342, 0.0005545722449230342], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26840 loss: 2.3526 iter time (s): 62.435 samples/sec: 16.401 %comms: 0.002331268390020753 %optimizer_step 0.047143677778592456 %forward: 23.255223245702044 %backward: 62.479949278925595 [2025-04-13 14:08:02,862] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 15231.59 | forward: 145193.40 | backward_microstep: 390100.01 | backward: 390092.00 | backward_inner_microstep: 390075.63 | backward_inner: 390069.50 | backward_allreduce_microstep: 7.91 | backward_allreduce: 2.73 | reduce_tied_grads: 0.30 | comms: 14.56 | reduce_grads: 0.20 | step: 294.34 | _step_clipping: 0.12 | _step_step: 292.52 | _step_zero_grad: 0.51 | _step_check_overflow: 0.61 samples/sec: 16.401 | iteration 26840/ 143000 | elapsed time per iteration (ms): 62435.3 | learning rate: 5.546E-04 | approx flops per GPU: 70.8TFLOPS | lm_loss: 2.323486E+00 | loss scale: 262144.0 | number of skipped iterations: 2 | number of nan iterations: 0 | time (ms) [2025-04-13 14:18:39,742] [INFO] [logging.py:60:log_dist] [Rank 0] step=26850, skipped=33, lr=[0.0005545373686521854, 0.0005545373686521854], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26850 loss: 2.3011 iter time (s): 63.687 samples/sec: 16.079 %comms: 0.002871656036676432 %optimizer_step 0.05933811181330225 %forward: 22.820191261568613 %backward: 61.27443748253006 [2025-04-13 14:18:39,743] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27397.04 | forward: 145336.02 | backward_microstep: 390253.38 | backward: 390241.39 | backward_inner_microstep: 390223.91 | backward_inner: 390217.33 | backward_allreduce_microstep: 8.36 | backward_allreduce: 2.88 | reduce_tied_grads: 0.32 | comms: 18.29 | reduce_grads: 0.20 | step: 377.91 | _step_clipping: 0.14 | _step_step: 375.95 | _step_zero_grad: 0.53 | _step_check_overflow: 0.67 samples/sec: 16.078 | iteration 26850/ 143000 | elapsed time per iteration (ms): 63688.1 | learning rate: 5.545E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.326294E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 14:29:11,984] [INFO] [logging.py:60:log_dist] [Rank 0] step=26860, skipped=33, lr=[0.000554502480096222, 0.000554502480096222], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26860 loss: 2.3157 iter time (s): 63.224 samples/sec: 16.196 %comms: 0.002873608941183726 %optimizer_step 0.05677687421674104 %forward: 23.04881781311243 %backward: 61.77757795716751 [2025-04-13 14:29:11,984] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21986.20 | forward: 145722.78 | backward_microstep: 390597.78 | backward: 390579.71 | backward_inner_microstep: 390559.87 | backward_inner: 390552.53 | backward_allreduce_microstep: 9.56 | backward_allreduce: 3.31 | reduce_tied_grads: 0.30 | comms: 18.17 | reduce_grads: 0.21 | step: 358.96 | _step_clipping: 0.11 | _step_step: 357.22 | _step_zero_grad: 0.48 | _step_check_overflow: 0.56 samples/sec: 16.196 | iteration 26860/ 143000 | elapsed time per iteration (ms): 63224.1 | learning rate: 5.545E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.316928E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 14:39:49,492] [INFO] [logging.py:60:log_dist] [Rank 0] step=26870, skipped=33, lr=[0.0005544675792568275, 0.0005544675792568275], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26870 loss: 2.3160 iter time (s): 63.750 samples/sec: 16.063 %comms: 0.0028559282183931705 %optimizer_step 0.05906596154794762 %forward: 22.87165408920826 %backward: 61.24831080844053 [2025-04-13 14:39:49,493] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27287.19 | forward: 145807.23 | backward_microstep: 390473.03 | backward: 390459.14 | backward_inner_microstep: 390440.06 | backward_inner: 390431.34 | backward_allreduce_microstep: 9.11 | backward_allreduce: 3.13 | reduce_tied_grads: 0.32 | comms: 18.21 | reduce_grads: 0.23 | step: 376.55 | _step_clipping: 0.12 | _step_step: 373.54 | _step_zero_grad: 0.64 | _step_check_overflow: 1.60 samples/sec: 16.063 | iteration 26870/ 143000 | elapsed time per iteration (ms): 63750.8 | learning rate: 5.545E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.317756E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 14:50:31,580] [INFO] [logging.py:60:log_dist] [Rank 0] step=26880, skipped=33, lr=[0.0005544326661356868, 0.0005544326661356868], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26880 loss: 2.3383 iter time (s): 64.208 samples/sec: 15.948 %comms: 0.0028556068613878834 %optimizer_step 0.05817682587251704 %forward: 22.689635777934907 %backward: 60.78777164264753 [2025-04-13 14:50:31,581] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32157.05 | forward: 145686.11 | backward_microstep: 390318.98 | backward: 390307.44 | backward_inner_microstep: 390289.24 | backward_inner: 390282.31 | backward_allreduce_microstep: 8.72 | backward_allreduce: 2.99 | reduce_tied_grads: 0.34 | comms: 18.34 | reduce_grads: 0.23 | step: 373.54 | _step_clipping: 0.11 | _step_step: 371.64 | _step_zero_grad: 0.61 | _step_check_overflow: 0.52 samples/sec: 15.948 | iteration 26880/ 143000 | elapsed time per iteration (ms): 64208.8 | learning rate: 5.544E-04 | approx flops per GPU: 68.8TFLOPS | lm_loss: 2.311105E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 15:01:03,926] [INFO] [logging.py:60:log_dist] [Rank 0] step=26890, skipped=33, lr=[0.0005543977407344845, 0.0005543977407344845], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26890 loss: 2.2972 iter time (s): 63.234 samples/sec: 16.194 %comms: 0.002879281996858789 %optimizer_step 0.057505179153029946 %forward: 23.003732267733128 %backward: 61.725645220146404 [2025-04-13 15:01:03,926] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22627.71 | forward: 145461.66 | backward_microstep: 390327.61 | backward: 390315.58 | backward_inner_microstep: 390297.81 | backward_inner: 390291.12 | backward_allreduce_microstep: 8.54 | backward_allreduce: 2.93 | reduce_tied_grads: 0.37 | comms: 18.21 | reduce_grads: 0.23 | step: 363.63 | _step_clipping: 0.14 | _step_step: 361.69 | _step_zero_grad: 0.55 | _step_check_overflow: 0.63 samples/sec: 16.194 | iteration 26890/ 143000 | elapsed time per iteration (ms): 63234.5 | learning rate: 5.544E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.303915E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 15:11:38,480] [INFO] [logging.py:60:log_dist] [Rank 0] step=26900, skipped=33, lr=[0.0005543628030549065, 0.0005543628030549065], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26900 loss: 2.3158 iter time (s): 63.455 samples/sec: 16.137 %comms: 0.0028539279187404162 %optimizer_step 0.056981732904521264 %forward: 22.911522924710955 %backward: 61.478080428356066 [2025-04-13 15:11:38,481] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25161.36 | forward: 145384.75 | backward_microstep: 390117.35 | backward: 390108.31 | backward_inner_microstep: 390091.24 | backward_inner: 390084.75 | backward_allreduce_microstep: 8.14 | backward_allreduce: 2.80 | reduce_tied_grads: 0.30 | comms: 18.11 | reduce_grads: 0.21 | step: 361.58 | _step_clipping: 0.14 | _step_step: 359.80 | _step_zero_grad: 0.54 | _step_check_overflow: 0.49 samples/sec: 16.137 | iteration 26900/ 143000 | elapsed time per iteration (ms): 63455.4 | learning rate: 5.544E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.317152E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 15:22:10,953] [INFO] [logging.py:60:log_dist] [Rank 0] step=26910, skipped=33, lr=[0.000554327853098639, 0.000554327853098639], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26910 loss: 2.3271 iter time (s): 63.247 samples/sec: 16.191 %comms: 0.0028507282492596384 %optimizer_step 0.05731704231976108 %forward: 22.978497389534844 %backward: 61.69257536137691 [2025-04-13 15:22:10,954] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23058.16 | forward: 145331.52 | backward_microstep: 390196.36 | backward: 390185.46 | backward_inner_microstep: 390167.88 | backward_inner: 390161.29 | backward_allreduce_microstep: 8.49 | backward_allreduce: 2.92 | reduce_tied_grads: 0.32 | comms: 18.03 | reduce_grads: 0.21 | step: 362.51 | _step_clipping: 0.13 | _step_step: 360.70 | _step_zero_grad: 0.53 | _step_check_overflow: 0.54 samples/sec: 16.190 | iteration 26910/ 143000 | elapsed time per iteration (ms): 63247.3 | learning rate: 5.543E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.325781E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 15:32:44,719] [INFO] [logging.py:60:log_dist] [Rank 0] step=26920, skipped=33, lr=[0.0005542928908673691, 0.0005542928908673691], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26920 loss: 2.3430 iter time (s): 63.376 samples/sec: 16.158 %comms: 0.0028808837624883813 %optimizer_step 0.05813302837088867 %forward: 22.923931138346667 %backward: 61.55923834852071 [2025-04-13 15:32:44,719] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24431.13 | forward: 145282.45 | backward_microstep: 390150.08 | backward: 390137.13 | backward_inner_microstep: 390118.84 | backward_inner: 390112.09 | backward_allreduce_microstep: 8.95 | backward_allreduce: 3.08 | reduce_tied_grads: 0.36 | comms: 18.26 | reduce_grads: 0.27 | step: 368.42 | _step_clipping: 0.14 | _step_step: 366.42 | _step_zero_grad: 0.61 | _step_check_overflow: 0.59 samples/sec: 16.157 | iteration 26920/ 143000 | elapsed time per iteration (ms): 63376.5 | learning rate: 5.543E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.316900E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 15:43:28,813] [INFO] [logging.py:60:log_dist] [Rank 0] step=26930, skipped=33, lr=[0.0005542579163627838, 0.0005542579163627838], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26930 loss: 2.3123 iter time (s): 64.409 samples/sec: 15.898 %comms: 0.002793591403818418 %optimizer_step 0.05700316059799847 %forward: 22.613619471955072 %backward: 60.59723607576769 [2025-04-13 15:43:28,814] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 34226.14 | forward: 145651.81 | backward_microstep: 390312.15 | backward: 390300.06 | backward_inner_microstep: 390278.89 | backward_inner: 390271.91 | backward_allreduce_microstep: 11.09 | backward_allreduce: 3.19 | reduce_tied_grads: 0.31 | comms: 17.99 | reduce_grads: 0.22 | step: 367.15 | _step_clipping: 0.12 | _step_step: 365.25 | _step_zero_grad: 0.58 | _step_check_overflow: 0.58 samples/sec: 15.898 | iteration 26930/ 143000 | elapsed time per iteration (ms): 64409.5 | learning rate: 5.543E-04 | approx flops per GPU: 68.6TFLOPS | lm_loss: 2.323978E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 15:54:10,169] [INFO] [logging.py:60:log_dist] [Rank 0] step=26940, skipped=33, lr=[0.0005542229295865713, 0.0005542229295865713], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26940 loss: 2.3025 iter time (s): 64.135 samples/sec: 15.966 %comms: 0.0028498390129060554 %optimizer_step 0.06065754514242896 %forward: 22.7102338602221 %backward: 60.87505926029281 [2025-04-13 15:54:10,170] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31342.33 | forward: 145651.80 | backward_microstep: 390435.20 | backward: 390421.43 | backward_inner_microstep: 390401.83 | backward_inner: 390394.46 | backward_allreduce_microstep: 9.24 | backward_allreduce: 3.24 | reduce_tied_grads: 0.38 | comms: 18.28 | reduce_grads: 0.25 | step: 389.03 | _step_clipping: 0.15 | _step_step: 386.96 | _step_zero_grad: 0.66 | _step_check_overflow: 0.56 samples/sec: 15.966 | iteration 26940/ 143000 | elapsed time per iteration (ms): 64135.6 | learning rate: 5.542E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.318892E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 16:04:43,384] [INFO] [logging.py:60:log_dist] [Rank 0] step=26950, skipped=33, lr=[0.0005541879305404203, 0.0005541879305404203], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26950 loss: 2.3185 iter time (s): 63.321 samples/sec: 16.172 %comms: 0.0028370701876600482 %optimizer_step 0.05560283303023646 %forward: 22.971591111819098 %backward: 61.62749365503893 [2025-04-13 16:04:43,385] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23624.46 | forward: 145458.33 | backward_microstep: 390241.90 | backward: 390231.22 | backward_inner_microstep: 390213.87 | backward_inner: 390207.12 | backward_allreduce_microstep: 8.30 | backward_allreduce: 2.86 | reduce_tied_grads: 0.30 | comms: 17.96 | reduce_grads: 0.20 | step: 352.08 | _step_clipping: 0.13 | _step_step: 350.33 | _step_zero_grad: 0.48 | _step_check_overflow: 0.57 samples/sec: 16.171 | iteration 26950/ 143000 | elapsed time per iteration (ms): 63321.5 | learning rate: 5.542E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.319480E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 16:15:16,030] [INFO] [logging.py:60:log_dist] [Rank 0] step=26960, skipped=33, lr=[0.00055415291922602, 0.00055415291922602], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26960 loss: 2.3029 iter time (s): 63.264 samples/sec: 16.186 %comms: 0.0028465591792232927 %optimizer_step 0.05629507655328597 %forward: 23.000951404170902 %backward: 61.67820694235976 [2025-04-13 16:15:16,030] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23044.00 | forward: 145513.21 | backward_microstep: 390212.05 | backward: 390200.98 | backward_inner_microstep: 390183.71 | backward_inner: 390176.88 | backward_allreduce_microstep: 8.35 | backward_allreduce: 2.88 | reduce_tied_grads: 0.33 | comms: 18.01 | reduce_grads: 0.21 | step: 356.15 | _step_clipping: 0.13 | _step_step: 354.39 | _step_zero_grad: 0.52 | _step_check_overflow: 0.51 samples/sec: 16.186 | iteration 26960/ 143000 | elapsed time per iteration (ms): 63264.6 | learning rate: 5.542E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.310719E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 16:25:47,548] [INFO] [logging.py:60:log_dist] [Rank 0] step=26970, skipped=33, lr=[0.0005541178956450602, 0.0005541178956450602], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26970 loss: 2.3152 iter time (s): 63.151 samples/sec: 16.215 %comms: 0.002845068402870147 %optimizer_step 0.05578578894626281 %forward: 23.071734748445543 %backward: 61.78096598316516 [2025-04-13 16:25:47,549] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21803.74 | forward: 145701.08 | backward_microstep: 390163.85 | backward: 390155.02 | backward_inner_microstep: 390138.36 | backward_inner: 390132.11 | backward_allreduce_microstep: 8.02 | backward_allreduce: 2.77 | reduce_tied_grads: 0.30 | comms: 17.97 | reduce_grads: 0.21 | step: 352.29 | _step_clipping: 0.12 | _step_step: 350.44 | _step_zero_grad: 0.57 | _step_check_overflow: 0.56 samples/sec: 16.215 | iteration 26970/ 143000 | elapsed time per iteration (ms): 63151.9 | learning rate: 5.541E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.307530E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 16:36:22,865] [INFO] [logging.py:60:log_dist] [Rank 0] step=26980, skipped=33, lr=[0.0005540828597992311, 0.0005540828597992311], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26980 loss: 2.3273 iter time (s): 63.531 samples/sec: 16.118 %comms: 0.002825773986683417 %optimizer_step 0.055529697494004604 %forward: 22.894201602259034 %backward: 61.40303795246799 [2025-04-13 16:36:22,866] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25905.78 | forward: 145449.29 | backward_microstep: 390108.51 | backward: 390100.02 | backward_inner_microstep: 390082.58 | backward_inner: 390076.26 | backward_allreduce_microstep: 8.38 | backward_allreduce: 2.90 | reduce_tied_grads: 0.27 | comms: 17.95 | reduce_grads: 0.19 | step: 352.79 | _step_clipping: 0.12 | _step_step: 351.00 | _step_zero_grad: 0.53 | _step_check_overflow: 0.53 samples/sec: 16.118 | iteration 26980/ 143000 | elapsed time per iteration (ms): 63531.6 | learning rate: 5.541E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.314600E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 16:47:01,166] [INFO] [logging.py:60:log_dist] [Rank 0] step=26990, skipped=33, lr=[0.0005540478116902239, 0.0005540478116902239], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 26990 loss: 2.3168 iter time (s): 63.829 samples/sec: 16.043 %comms: 0.002818875275652801 %optimizer_step 0.05611745794697953 %forward: 22.781599141649995 %backward: 61.136363982128216 [2025-04-13 16:47:01,166] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28797.83 | forward: 145413.77 | backward_microstep: 390240.17 | backward: 390230.25 | backward_inner_microstep: 390210.55 | backward_inner: 390204.02 | backward_allreduce_microstep: 8.64 | backward_allreduce: 2.98 | reduce_tied_grads: 0.30 | comms: 17.99 | reduce_grads: 0.21 | step: 358.19 | _step_clipping: 0.12 | _step_step: 356.30 | _step_zero_grad: 0.53 | _step_check_overflow: 0.48 samples/sec: 16.043 | iteration 26990/ 143000 | elapsed time per iteration (ms): 63830.0 | learning rate: 5.540E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.320214E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 16:57:32,975] [INFO] [logging.py:60:log_dist] [Rank 0] step=27000, skipped=33, lr=[0.0005540127513197302, 0.0005540127513197302], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27000 loss: 2.3160 iter time (s): 63.180 samples/sec: 16.208 %comms: 0.002835231200487313 %optimizer_step 0.055027025108479656 %forward: 22.99532018133531 %backward: 61.74636456174631 [2025-04-13 16:57:32,975] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22541.14 | forward: 145285.34 | backward_microstep: 390126.05 | backward: 390115.98 | backward_inner_microstep: 390098.92 | backward_inner: 390092.52 | backward_allreduce_microstep: 8.21 | backward_allreduce: 2.83 | reduce_tied_grads: 0.29 | comms: 17.91 | reduce_grads: 0.19 | step: 347.66 | _step_clipping: 0.14 | _step_step: 345.96 | _step_zero_grad: 0.49 | _step_check_overflow: 0.52 samples/sec: 16.207 | iteration 27000/ 143000 | elapsed time per iteration (ms): 63180.9 | learning rate: 5.540E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.307807E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 16:57:35,833] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: ../checkpoints/mydata_left-pythia160m/global_step27000/mp_rank_00_model_states.pt [2025-04-13 16:57:50,215] [INFO] [engine.py:1805:_copy_recovery_script] creating recovery script ../checkpoints/mydata_left-pythia160m/zero_to_fp32.py [2025-04-13 16:57:50,226] [INFO] [engine.py:1818:_save_zero_checkpoint] zero checkpoint saved ../checkpoints/mydata_left-pythia160m/global_step27000/zero_pp_rank_0_mp_rank_00_optim_states.pt [2025-04-13 17:08:28,271] [INFO] [logging.py:60:log_dist] [Rank 0] step=27010, skipped=33, lr=[0.0005539776786894421, 0.0005539776786894421], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27010 loss: 2.3187 iter time (s): 63.803 samples/sec: 16.050 %comms: 0.0031122863437304115 %optimizer_step 0.056867731049289306 %forward: 22.83278520542589 %backward: 61.15969846554699 [2025-04-13 17:08:28,272] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28235.90 | forward: 145678.90 | backward_microstep: 390225.61 | backward: 390214.23 | backward_inner_microstep: 390196.44 | backward_inner: 390189.76 | backward_allreduce_microstep: 8.60 | backward_allreduce: 2.88 | reduce_tied_grads: 0.31 | comms: 19.86 | reduce_grads: 0.22 | step: 362.83 | _step_clipping: 0.12 | _step_step: 361.07 | _step_zero_grad: 0.52 | _step_check_overflow: 0.51 samples/sec: 15.627 | iteration 27010/ 143000 | elapsed time per iteration (ms): 65529.6 | learning rate: 5.540E-04 | approx flops per GPU: 67.4TFLOPS | lm_loss: 2.309409E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 17:18:53,580] [INFO] [logging.py:60:log_dist] [Rank 0] step=27020, skipped=33, lr=[0.0005539425938010523, 0.0005539425938010523], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27020 loss: 2.3209 iter time (s): 62.530 samples/sec: 16.376 %comms: 0.0029499992457236645 %optimizer_step 0.05695008431413929 %forward: 23.220125075583244 %backward: 62.37132362659028 [2025-04-13 17:18:53,581] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 16226.42 | forward: 145196.23 | backward_microstep: 390018.02 | backward: 390010.00 | backward_inner_microstep: 389990.92 | backward_inner: 389984.63 | backward_allreduce_microstep: 10.46 | backward_allreduce: 5.08 | reduce_tied_grads: 0.36 | comms: 18.45 | reduce_grads: 0.24 | step: 356.11 | _step_clipping: 0.14 | _step_step: 354.30 | _step_zero_grad: 0.50 | _step_check_overflow: 0.54 samples/sec: 16.376 | iteration 27020/ 143000 | elapsed time per iteration (ms): 62530.9 | learning rate: 5.539E-04 | approx flops per GPU: 70.6TFLOPS | lm_loss: 2.314883E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 17:29:23,484] [INFO] [logging.py:60:log_dist] [Rank 0] step=27030, skipped=33, lr=[0.0005539074966562542, 0.0005539074966562542], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27030 loss: 2.3281 iter time (s): 62.990 samples/sec: 16.257 %comms: 0.002874126638980377 %optimizer_step 0.058313801634735506 %forward: 23.074801117781437 %backward: 61.92426847389759 [2025-04-13 17:29:23,485] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20624.39 | forward: 145347.79 | backward_microstep: 390068.30 | backward: 390059.95 | backward_inner_microstep: 390041.84 | backward_inner: 390035.33 | backward_allreduce_microstep: 9.01 | backward_allreduce: 3.03 | reduce_tied_grads: 0.35 | comms: 18.10 | reduce_grads: 0.24 | step: 367.32 | _step_clipping: 0.13 | _step_step: 365.43 | _step_zero_grad: 0.58 | _step_check_overflow: 0.54 samples/sec: 16.256 | iteration 27030/ 143000 | elapsed time per iteration (ms): 62990.4 | learning rate: 5.539E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.321169E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 17:39:53,487] [INFO] [logging.py:60:log_dist] [Rank 0] step=27040, skipped=33, lr=[0.0005538723872567418, 0.0005538723872567418], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27040 loss: 2.3095 iter time (s): 63.000 samples/sec: 16.254 %comms: 0.002848585016757223 %optimizer_step 0.05655450378900143 %forward: 23.046230616489883 %backward: 61.918930323253996 [2025-04-13 17:39:53,488] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20909.66 | forward: 145190.60 | backward_microstep: 390096.24 | backward: 390087.51 | backward_inner_microstep: 390070.57 | backward_inner: 390064.28 | backward_allreduce_microstep: 8.29 | backward_allreduce: 2.95 | reduce_tied_grads: 0.29 | comms: 17.95 | reduce_grads: 0.20 | step: 356.29 | _step_clipping: 0.12 | _step_step: 354.53 | _step_zero_grad: 0.51 | _step_check_overflow: 0.55 samples/sec: 16.254 | iteration 27040/ 143000 | elapsed time per iteration (ms): 63000.3 | learning rate: 5.539E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.324210E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 17:50:27,827] [INFO] [logging.py:60:log_dist] [Rank 0] step=27050, skipped=33, lr=[0.0005538372656042095, 0.0005538372656042095], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27050 loss: 2.3116 iter time (s): 63.433 samples/sec: 16.143 %comms: 0.0028250874689010916 %optimizer_step 0.05579833401986522 %forward: 22.90909418327901 %backward: 61.49140147095814 [2025-04-13 17:50:27,827] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25128.29 | forward: 145320.20 | backward_microstep: 390068.97 | backward: 390060.95 | backward_inner_microstep: 390043.12 | backward_inner: 390036.75 | backward_allreduce_microstep: 8.74 | backward_allreduce: 3.02 | reduce_tied_grads: 0.26 | comms: 17.92 | reduce_grads: 0.19 | step: 353.95 | _step_clipping: 0.11 | _step_step: 352.31 | _step_zero_grad: 0.52 | _step_check_overflow: 0.44 samples/sec: 16.143 | iteration 27050/ 143000 | elapsed time per iteration (ms): 63434.0 | learning rate: 5.538E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.311309E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 18:00:58,192] [INFO] [logging.py:60:log_dist] [Rank 0] step=27060, skipped=33, lr=[0.0005538021317003526, 0.0005538021317003526], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27060 loss: 2.3565 iter time (s): 63.036 samples/sec: 16.245 %comms: 0.002849404340884376 %optimizer_step 0.05698139221526191 %forward: 23.055858511845578 %backward: 61.888788939987585 [2025-04-13 18:00:58,193] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21069.29 | forward: 145334.88 | backward_microstep: 390131.10 | backward: 390122.10 | backward_inner_microstep: 390105.10 | backward_inner: 390098.75 | backward_allreduce_microstep: 8.23 | backward_allreduce: 2.83 | reduce_tied_grads: 0.30 | comms: 17.96 | reduce_grads: 0.21 | step: 359.19 | _step_clipping: 0.13 | _step_step: 357.43 | _step_zero_grad: 0.51 | _step_check_overflow: 0.53 samples/sec: 16.245 | iteration 27060/ 143000 | elapsed time per iteration (ms): 63036.5 | learning rate: 5.538E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.330552E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 18:11:38,842] [INFO] [logging.py:60:log_dist] [Rank 0] step=27070, skipped=33, lr=[0.0005537669855468667, 0.0005537669855468667], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27070 loss: 2.3387 iter time (s): 64.064 samples/sec: 15.984 %comms: 0.0027981521357288612 %optimizer_step 0.055707521719685564 %forward: 22.719282242433405 %backward: 60.88591853841151 [2025-04-13 18:11:38,843] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31267.02 | forward: 145549.90 | backward_microstep: 390070.64 | backward: 390062.47 | backward_inner_microstep: 390044.65 | backward_inner: 390038.06 | backward_allreduce_microstep: 8.76 | backward_allreduce: 3.02 | reduce_tied_grads: 0.30 | comms: 17.93 | reduce_grads: 0.20 | step: 356.89 | _step_clipping: 0.12 | _step_step: 355.17 | _step_zero_grad: 0.49 | _step_check_overflow: 0.54 samples/sec: 15.984 | iteration 27070/ 143000 | elapsed time per iteration (ms): 64065.0 | learning rate: 5.538E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.329045E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 18:22:20,408] [INFO] [logging.py:60:log_dist] [Rank 0] step=27080, skipped=33, lr=[0.0005537318271454482, 0.0005537318271454482], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27080 loss: 2.2931 iter time (s): 64.156 samples/sec: 15.961 %comms: 0.0028360436775149425 %optimizer_step 0.05760816148641032 %forward: 22.664240428915967 %backward: 60.81926310493121 [2025-04-13 18:22:20,409] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32138.50 | forward: 145404.64 | backward_microstep: 390201.71 | backward: 390191.89 | backward_inner_microstep: 390171.63 | backward_inner: 390164.77 | backward_allreduce_microstep: 9.17 | backward_allreduce: 3.07 | reduce_tied_grads: 0.34 | comms: 18.19 | reduce_grads: 0.23 | step: 369.59 | _step_clipping: 0.14 | _step_step: 367.52 | _step_zero_grad: 0.67 | _step_check_overflow: 0.61 samples/sec: 15.961 | iteration 27080/ 143000 | elapsed time per iteration (ms): 64156.6 | learning rate: 5.537E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.309767E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 18:32:52,395] [INFO] [logging.py:60:log_dist] [Rank 0] step=27090, skipped=33, lr=[0.000553696656497794, 0.000553696656497794], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27090 loss: 2.3149 iter time (s): 63.198 samples/sec: 16.203 %comms: 0.0028431160891263818 %optimizer_step 0.055923368763907536 %forward: 22.9873593729981 %backward: 61.73087327556943 [2025-04-13 18:32:52,396] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22756.64 | forward: 145275.63 | backward_microstep: 390136.34 | backward: 390127.08 | backward_inner_microstep: 390110.18 | backward_inner: 390103.79 | backward_allreduce_microstep: 8.12 | backward_allreduce: 2.80 | reduce_tied_grads: 0.30 | comms: 17.97 | reduce_grads: 0.20 | step: 353.42 | _step_clipping: 0.11 | _step_step: 351.73 | _step_zero_grad: 0.50 | _step_check_overflow: 0.52 samples/sec: 16.203 | iteration 27090/ 143000 | elapsed time per iteration (ms): 63198.6 | learning rate: 5.537E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.304744E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 18:43:27,689] [INFO] [logging.py:60:log_dist] [Rank 0] step=27100, skipped=33, lr=[0.0005536614736056013, 0.0005536614736056013], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27100 loss: 2.3209 iter time (s): 63.529 samples/sec: 16.119 %comms: 0.0028291372436784253 %optimizer_step 0.05555572133028398 %forward: 22.8851620872661 %backward: 61.41328544662991 [2025-04-13 18:43:27,690] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25894.08 | forward: 145386.81 | backward_microstep: 390161.55 | backward: 390151.56 | backward_inner_microstep: 390133.60 | backward_inner: 390125.22 | backward_allreduce_microstep: 8.75 | backward_allreduce: 3.01 | reduce_tied_grads: 0.31 | comms: 17.97 | reduce_grads: 0.23 | step: 352.94 | _step_clipping: 0.12 | _step_step: 351.22 | _step_zero_grad: 0.49 | _step_check_overflow: 0.53 samples/sec: 16.119 | iteration 27100/ 143000 | elapsed time per iteration (ms): 63529.4 | learning rate: 5.537E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.325029E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 18:54:03,467] [INFO] [logging.py:60:log_dist] [Rank 0] step=27110, skipped=33, lr=[0.0005536262784705687, 0.0005536262784705687], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27110 loss: 2.3122 iter time (s): 63.577 samples/sec: 16.106 %comms: 0.002841275137851401 %optimizer_step 0.057792982329414426 %forward: 22.86944852250599 %backward: 61.37024549219452 [2025-04-13 18:54:03,467] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26318.18 | forward: 145397.47 | backward_microstep: 390185.58 | backward: 390174.63 | backward_inner_microstep: 390156.02 | backward_inner: 390149.15 | backward_allreduce_microstep: 9.00 | backward_allreduce: 3.12 | reduce_tied_grads: 0.33 | comms: 18.06 | reduce_grads: 0.24 | step: 367.43 | _step_clipping: 0.12 | _step_step: 365.58 | _step_zero_grad: 0.54 | _step_check_overflow: 0.57 samples/sec: 16.106 | iteration 27110/ 143000 | elapsed time per iteration (ms): 63577.8 | learning rate: 5.536E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.312580E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 19:04:53,312] [INFO] [logging.py:60:log_dist] [Rank 0] step=27120, skipped=33, lr=[0.0005535910710943946, 0.0005535910710943946], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27120 loss: 2.3111 iter time (s): 64.984 samples/sec: 15.758 %comms: 0.0027993951210871955 %optimizer_step 0.05603258942791007 %forward: 22.412002356629806 %backward: 60.02581156697443 [2025-04-13 19:04:53,313] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 40229.84 | forward: 145642.05 | backward_microstep: 390080.92 | backward: 390071.45 | backward_inner_microstep: 390053.09 | backward_inner: 390046.43 | backward_allreduce_microstep: 9.09 | backward_allreduce: 3.06 | reduce_tied_grads: 0.35 | comms: 18.19 | reduce_grads: 0.23 | step: 364.12 | _step_clipping: 0.14 | _step_step: 362.14 | _step_zero_grad: 0.55 | _step_check_overflow: 0.64 samples/sec: 15.758 | iteration 27120/ 143000 | elapsed time per iteration (ms): 64984.5 | learning rate: 5.536E-04 | approx flops per GPU: 68.0TFLOPS | lm_loss: 2.314299E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 19:15:34,816] [INFO] [logging.py:60:log_dist] [Rank 0] step=27130, skipped=33, lr=[0.0005535558514787782, 0.0005535558514787782], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27130 loss: 2.2961 iter time (s): 64.150 samples/sec: 15.963 %comms: 0.002822939941914269 %optimizer_step 0.05600845898868792 %forward: 22.66898271801854 %backward: 60.81324927801178 [2025-04-13 19:15:34,816] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32078.97 | forward: 145420.91 | backward_microstep: 390125.55 | backward: 390115.35 | backward_inner_microstep: 390093.89 | backward_inner: 390087.14 | backward_allreduce_microstep: 10.48 | backward_allreduce: 4.71 | reduce_tied_grads: 0.31 | comms: 18.11 | reduce_grads: 0.20 | step: 359.29 | _step_clipping: 0.12 | _step_step: 357.42 | _step_zero_grad: 0.54 | _step_check_overflow: 0.60 samples/sec: 15.963 | iteration 27130/ 143000 | elapsed time per iteration (ms): 64150.3 | learning rate: 5.536E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.303215E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 19:26:10,941] [INFO] [logging.py:60:log_dist] [Rank 0] step=27140, skipped=33, lr=[0.0005535206196254193, 0.0005535206196254193], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27140 loss: 2.3188 iter time (s): 63.612 samples/sec: 16.098 %comms: 0.0028360844177998325 %optimizer_step 0.057451576526214757 %forward: 22.8767118733578 %backward: 61.330312604274425 [2025-04-13 19:26:10,942] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26559.39 | forward: 145523.30 | backward_microstep: 390144.84 | backward: 390134.28 | backward_inner_microstep: 390115.55 | backward_inner: 390108.56 | backward_allreduce_microstep: 9.06 | backward_allreduce: 3.13 | reduce_tied_grads: 0.33 | comms: 18.04 | reduce_grads: 0.23 | step: 365.46 | _step_clipping: 0.13 | _step_step: 363.57 | _step_zero_grad: 0.56 | _step_check_overflow: 0.56 samples/sec: 16.097 | iteration 27140/ 143000 | elapsed time per iteration (ms): 63612.6 | learning rate: 5.535E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.314119E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 19:36:41,712] [INFO] [logging.py:60:log_dist] [Rank 0] step=27150, skipped=33, lr=[0.0005534853755360186, 0.0005534853755360186], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27150 loss: 2.3055 iter time (s): 63.076 samples/sec: 16.234 %comms: 0.002854417057028998 %optimizer_step 0.056574661733227644 %forward: 23.09649053775317 %backward: 61.874017687354076 [2025-04-13 19:36:41,713] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20904.60 | forward: 145684.51 | backward_microstep: 390290.51 | backward: 390279.48 | backward_inner_microstep: 390262.23 | backward_inner: 390255.55 | backward_allreduce_microstep: 8.22 | backward_allreduce: 2.83 | reduce_tied_grads: 0.31 | comms: 18.00 | reduce_grads: 0.21 | step: 356.85 | _step_clipping: 0.11 | _step_step: 355.00 | _step_zero_grad: 0.56 | _step_check_overflow: 0.56 samples/sec: 16.234 | iteration 27150/ 143000 | elapsed time per iteration (ms): 63077.1 | learning rate: 5.535E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.308645E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 19:47:17,816] [INFO] [logging.py:60:log_dist] [Rank 0] step=27160, skipped=33, lr=[0.000553450119212277, 0.000553450119212277], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27160 loss: 2.2998 iter time (s): 63.610 samples/sec: 16.098 %comms: 0.0028357674600862385 %optimizer_step 0.05600623867042112 %forward: 22.870109429368224 %backward: 61.34024376182574 [2025-04-13 19:47:17,816] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26529.08 | forward: 145476.41 | backward_microstep: 390193.93 | backward: 390184.34 | backward_inner_microstep: 390165.67 | backward_inner: 390159.30 | backward_allreduce_microstep: 8.09 | backward_allreduce: 2.78 | reduce_tied_grads: 0.35 | comms: 18.04 | reduce_grads: 0.22 | step: 356.25 | _step_clipping: 0.13 | _step_step: 354.46 | _step_zero_grad: 0.49 | _step_check_overflow: 0.58 samples/sec: 16.098 | iteration 27160/ 143000 | elapsed time per iteration (ms): 63610.4 | learning rate: 5.535E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.307731E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 19:57:59,002] [INFO] [logging.py:60:log_dist] [Rank 0] step=27170, skipped=33, lr=[0.0005534148506558962, 0.0005534148506558962], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27170 loss: 2.3214 iter time (s): 64.118 samples/sec: 15.971 %comms: 0.0028235923133070736 %optimizer_step 0.055355237935644355 %forward: 22.68858160076851 %backward: 60.847995879285776 [2025-04-13 19:57:59,002] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31656.08 | forward: 145474.69 | backward_microstep: 390154.66 | backward: 390145.30 | backward_inner_microstep: 390127.23 | backward_inner: 390120.61 | backward_allreduce_microstep: 8.81 | backward_allreduce: 3.05 | reduce_tied_grads: 0.28 | comms: 18.10 | reduce_grads: 0.21 | step: 354.93 | _step_clipping: 0.11 | _step_step: 353.04 | _step_zero_grad: 0.66 | _step_check_overflow: 0.54 samples/sec: 15.970 | iteration 27170/ 143000 | elapsed time per iteration (ms): 64118.6 | learning rate: 5.534E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.308539E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 20:08:38,929] [INFO] [logging.py:60:log_dist] [Rank 0] step=27180, skipped=33, lr=[0.0005533795698685782, 0.0005533795698685782], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27180 loss: 2.3012 iter time (s): 63.992 samples/sec: 16.002 %comms: 0.002871435088514803 %optimizer_step 0.05735590062636252 %forward: 22.761829657857362 %backward: 60.96344570398418 [2025-04-13 20:08:38,929] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30225.88 | forward: 145657.76 | backward_microstep: 390128.19 | backward: 390117.97 | backward_inner_microstep: 390100.17 | backward_inner: 390093.40 | backward_allreduce_microstep: 8.56 | backward_allreduce: 2.95 | reduce_tied_grads: 0.36 | comms: 18.37 | reduce_grads: 0.23 | step: 367.03 | _step_clipping: 0.14 | _step_step: 365.11 | _step_zero_grad: 0.55 | _step_check_overflow: 0.59 samples/sec: 16.002 | iteration 27180/ 143000 | elapsed time per iteration (ms): 63992.7 | learning rate: 5.534E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.304928E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 20:19:21,170] [INFO] [logging.py:60:log_dist] [Rank 0] step=27190, skipped=33, lr=[0.0005533442768520261, 0.0005533442768520261], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27190 loss: 2.3096 iter time (s): 64.224 samples/sec: 15.944 %comms: 0.002819917221788612 %optimizer_step 0.056332933315002484 %forward: 22.65282853051511 %backward: 60.74858311586604 [2025-04-13 20:19:21,171] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32677.02 | forward: 145484.54 | backward_microstep: 390159.72 | backward: 390149.06 | backward_inner_microstep: 390130.41 | backward_inner: 390123.58 | backward_allreduce_microstep: 9.04 | backward_allreduce: 3.12 | reduce_tied_grads: 0.37 | comms: 18.11 | reduce_grads: 0.26 | step: 361.79 | _step_clipping: 0.13 | _step_step: 359.90 | _step_zero_grad: 0.55 | _step_check_overflow: 0.59 samples/sec: 15.944 | iteration 27190/ 143000 | elapsed time per iteration (ms): 64224.2 | learning rate: 5.533E-04 | approx flops per GPU: 68.8TFLOPS | lm_loss: 2.311393E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 20:29:54,048] [INFO] [logging.py:60:log_dist] [Rank 0] step=27200, skipped=33, lr=[0.0005533089716079432, 0.0005533089716079432], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27200 loss: 2.3302 iter time (s): 63.287 samples/sec: 16.180 %comms: 0.0028568582358371847 %optimizer_step 0.05686106278343607 %forward: 22.96659264932267 %backward: 61.6540691188283 [2025-04-13 20:29:54,048] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23414.23 | forward: 145348.96 | backward_microstep: 390201.84 | backward: 390190.88 | backward_inner_microstep: 390169.62 | backward_inner: 390162.55 | backward_allreduce_microstep: 9.59 | backward_allreduce: 3.30 | reduce_tied_grads: 0.32 | comms: 18.08 | reduce_grads: 0.23 | step: 359.86 | _step_clipping: 0.14 | _step_step: 358.08 | _step_zero_grad: 0.52 | _step_check_overflow: 0.53 samples/sec: 16.180 | iteration 27200/ 143000 | elapsed time per iteration (ms): 63287.7 | learning rate: 5.533E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.312970E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 20:40:24,978] [INFO] [logging.py:60:log_dist] [Rank 0] step=27210, skipped=33, lr=[0.0005532736541380333, 0.0005532736541380333], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27210 loss: 2.3405 iter time (s): 63.092 samples/sec: 16.230 %comms: 0.0028634044415629936 %optimizer_step 0.0566910546794048 %forward: 23.041598444911475 %backward: 61.81697348867415 [2025-04-13 20:40:24,979] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21673.24 | forward: 145375.16 | backward_microstep: 390026.82 | backward: 390018.63 | backward_inner_microstep: 390001.13 | backward_inner: 389994.76 | backward_allreduce_microstep: 8.63 | backward_allreduce: 2.98 | reduce_tied_grads: 0.30 | comms: 18.07 | reduce_grads: 0.22 | step: 357.68 | _step_clipping: 0.13 | _step_step: 355.81 | _step_zero_grad: 0.55 | _step_check_overflow: 0.57 samples/sec: 16.230 | iteration 27210/ 143000 | elapsed time per iteration (ms): 63093.1 | learning rate: 5.533E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.328488E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 20:51:06,061] [INFO] [logging.py:60:log_dist] [Rank 0] step=27220, skipped=33, lr=[0.0005532383244440012, 0.0005532383244440012], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27220 loss: 2.3002 iter time (s): 64.108 samples/sec: 15.973 %comms: 0.0028716552182930003 %optimizer_step 0.058129642999330315 %forward: 22.704609864607434 %backward: 60.88544371172432 [2025-04-13 20:51:06,061] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31327.73 | forward: 145553.79 | backward_microstep: 390332.94 | backward: 390321.93 | backward_inner_microstep: 390302.89 | backward_inner: 390295.96 | backward_allreduce_microstep: 9.34 | backward_allreduce: 3.23 | reduce_tied_grads: 0.39 | comms: 18.41 | reduce_grads: 0.26 | step: 372.66 | _step_clipping: 0.14 | _step_step: 370.63 | _step_zero_grad: 0.63 | _step_check_overflow: 0.58 samples/sec: 15.973 | iteration 27220/ 143000 | elapsed time per iteration (ms): 64108.3 | learning rate: 5.532E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.315845E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 21:01:56,480] [INFO] [logging.py:60:log_dist] [Rank 0] step=27230, skipped=33, lr=[0.0005532029825275521, 0.0005532029825275521], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27230 loss: 2.3109 iter time (s): 65.041 samples/sec: 15.744 %comms: 0.0027718885237358448 %optimizer_step 0.055780545073165065 %forward: 22.39260349292719 %backward: 60.00326200011512 [2025-04-13 21:01:56,480] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 40623.65 | forward: 145644.51 | backward_microstep: 390279.96 | backward: 390269.29 | backward_inner_microstep: 390251.21 | backward_inner: 390244.45 | backward_allreduce_microstep: 8.68 | backward_allreduce: 3.00 | reduce_tied_grads: 0.31 | comms: 18.03 | reduce_grads: 0.20 | step: 362.80 | _step_clipping: 0.11 | _step_step: 361.06 | _step_zero_grad: 0.52 | _step_check_overflow: 0.53 samples/sec: 15.744 | iteration 27230/ 143000 | elapsed time per iteration (ms): 65041.9 | learning rate: 5.532E-04 | approx flops per GPU: 67.9TFLOPS | lm_loss: 2.315447E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 21:12:39,911] [INFO] [logging.py:60:log_dist] [Rank 0] step=27240, skipped=33, lr=[0.0005531676283903916, 0.0005531676283903916], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27240 loss: 2.3232 iter time (s): 64.343 samples/sec: 15.915 %comms: 0.0027969528783954836 %optimizer_step 0.0544992652857373 %forward: 22.63118797292328 %backward: 60.633495005129504 [2025-04-13 21:12:39,912] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 33840.56 | forward: 145614.86 | backward_microstep: 390139.92 | backward: 390131.44 | backward_inner_microstep: 390114.08 | backward_inner: 390107.63 | backward_allreduce_microstep: 8.42 | backward_allreduce: 2.91 | reduce_tied_grads: 0.33 | comms: 18.00 | reduce_grads: 0.21 | step: 350.66 | _step_clipping: 0.12 | _step_step: 348.81 | _step_zero_grad: 0.47 | _step_check_overflow: 0.68 samples/sec: 15.915 | iteration 27240/ 143000 | elapsed time per iteration (ms): 64343.1 | learning rate: 5.532E-04 | approx flops per GPU: 68.7TFLOPS | lm_loss: 2.310723E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 21:23:15,985] [INFO] [logging.py:60:log_dist] [Rank 0] step=27250, skipped=33, lr=[0.0005531322620342261, 0.0005531322620342261], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27250 loss: 2.2956 iter time (s): 63.607 samples/sec: 16.099 %comms: 0.002834552873575167 %optimizer_step 0.05750399263219862 %forward: 22.844266549149207 %backward: 61.32673532597038 [2025-04-13 21:23:15,986] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26836.57 | forward: 145305.12 | backward_microstep: 390090.09 | backward: 390079.87 | backward_inner_microstep: 390061.94 | backward_inner: 390055.36 | backward_allreduce_microstep: 8.83 | backward_allreduce: 3.05 | reduce_tied_grads: 0.34 | comms: 18.03 | reduce_grads: 0.23 | step: 365.76 | _step_clipping: 0.16 | _step_step: 363.94 | _step_zero_grad: 0.54 | _step_check_overflow: 0.52 samples/sec: 16.099 | iteration 27250/ 143000 | elapsed time per iteration (ms): 63607.4 | learning rate: 5.531E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.310535E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 21:33:42,261] [INFO] [logging.py:60:log_dist] [Rank 0] step=27260, skipped=33, lr=[0.0005530968834607625, 0.0005530968834607625], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27260 loss: 2.3249 iter time (s): 62.627 samples/sec: 16.351 %comms: 0.0028728103805281033 %optimizer_step 0.05560739412403949 %forward: 23.196366788362834 %backward: 62.2858813091436 [2025-04-13 21:33:42,261] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17081.12 | forward: 145271.83 | backward_microstep: 390085.40 | backward: 390077.64 | backward_inner_microstep: 390061.40 | backward_inner: 390055.22 | backward_allreduce_microstep: 7.83 | backward_allreduce: 2.70 | reduce_tied_grads: 0.30 | comms: 17.99 | reduce_grads: 0.20 | step: 348.25 | _step_clipping: 0.11 | _step_step: 346.54 | _step_zero_grad: 0.50 | _step_check_overflow: 0.52 samples/sec: 16.351 | iteration 27260/ 143000 | elapsed time per iteration (ms): 62627.5 | learning rate: 5.531E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.323539E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 21:44:18,401] [INFO] [logging.py:60:log_dist] [Rank 0] step=27270, skipped=33, lr=[0.0005530614926717085, 0.0005530614926717085], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27270 loss: 2.3156 iter time (s): 63.613 samples/sec: 16.097 %comms: 0.002837441018405289 %optimizer_step 0.05642979705521971 %forward: 22.868152619165265 %backward: 61.34661085124434 [2025-04-13 21:44:18,402] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26517.25 | forward: 145472.32 | backward_microstep: 390256.87 | backward: 390247.26 | backward_inner_microstep: 390229.60 | backward_inner: 390222.85 | backward_allreduce_microstep: 8.53 | backward_allreduce: 2.93 | reduce_tied_grads: 0.37 | comms: 18.05 | reduce_grads: 0.22 | step: 358.97 | _step_clipping: 0.15 | _step_step: 357.16 | _step_zero_grad: 0.52 | _step_check_overflow: 0.55 samples/sec: 16.097 | iteration 27270/ 143000 | elapsed time per iteration (ms): 63614.1 | learning rate: 5.531E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.313485E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 21:54:53,598] [INFO] [logging.py:60:log_dist] [Rank 0] step=27280, skipped=33, lr=[0.0005530260896687722, 0.0005530260896687722], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27280 loss: 2.3042 iter time (s): 63.519 samples/sec: 16.121 %comms: 0.0028373056786427627 %optimizer_step 0.05710517306567711 %forward: 22.92231744644792 %backward: 61.44461359135174 [2025-04-13 21:54:53,598] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25383.65 | forward: 145600.40 | backward_microstep: 390301.10 | backward: 390290.41 | backward_inner_microstep: 390268.96 | backward_inner: 390260.52 | backward_allreduce_microstep: 10.30 | backward_allreduce: 3.00 | reduce_tied_grads: 0.33 | comms: 18.02 | reduce_grads: 0.21 | step: 362.73 | _step_clipping: 0.13 | _step_step: 360.91 | _step_zero_grad: 0.51 | _step_check_overflow: 0.58 samples/sec: 16.121 | iteration 27280/ 143000 | elapsed time per iteration (ms): 63519.7 | learning rate: 5.530E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.312799E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 22:05:51,006] [INFO] [logging.py:60:log_dist] [Rank 0] step=27290, skipped=33, lr=[0.000552990674453662, 0.000552990674453662], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27290 loss: 2.3194 iter time (s): 65.740 samples/sec: 15.576 %comms: 0.0027929070863068955 %optimizer_step 0.05765639526421662 %forward: 22.192197191631937 %backward: 59.377180646255965 [2025-04-13 22:05:51,007] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 47250.88 | forward: 145891.85 | backward_microstep: 390359.13 | backward: 390346.51 | backward_inner_microstep: 390326.88 | backward_inner: 390319.62 | backward_allreduce_microstep: 9.29 | backward_allreduce: 3.20 | reduce_tied_grads: 0.34 | comms: 18.36 | reduce_grads: 0.23 | step: 379.03 | _step_clipping: 0.13 | _step_step: 376.73 | _step_zero_grad: 0.65 | _step_check_overflow: 0.82 samples/sec: 15.576 | iteration 27290/ 143000 | elapsed time per iteration (ms): 65740.8 | learning rate: 5.530E-04 | approx flops per GPU: 67.2TFLOPS | lm_loss: 2.308403E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 22:16:33,689] [INFO] [logging.py:60:log_dist] [Rank 0] step=27300, skipped=33, lr=[0.0005529552470280875, 0.0005529552470280875], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27300 loss: 2.3079 iter time (s): 64.268 samples/sec: 15.933 %comms: 0.002802916456776247 %optimizer_step 0.055755871616206605 %forward: 22.6585704846799 %backward: 60.72833993545827 [2025-04-13 22:16:33,690] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32887.70 | forward: 145621.55 | backward_microstep: 390297.96 | backward: 390287.43 | backward_inner_microstep: 390267.86 | backward_inner: 390259.21 | backward_allreduce_microstep: 9.60 | backward_allreduce: 3.30 | reduce_tied_grads: 0.31 | comms: 18.01 | reduce_grads: 0.21 | step: 358.33 | _step_clipping: 0.12 | _step_step: 356.63 | _step_zero_grad: 0.50 | _step_check_overflow: 0.49 samples/sec: 15.933 | iteration 27300/ 143000 | elapsed time per iteration (ms): 64268.4 | learning rate: 5.530E-04 | approx flops per GPU: 68.7TFLOPS | lm_loss: 2.310930E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 22:27:11,695] [INFO] [logging.py:60:log_dist] [Rank 0] step=27310, skipped=33, lr=[0.0005529198073937584, 0.0005529198073937584], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27310 loss: 2.3330 iter time (s): 63.800 samples/sec: 16.050 %comms: 0.0028199168940969254 %optimizer_step 0.05628690146293245 %forward: 22.839967818627084 %backward: 61.16947923032664 [2025-04-13 22:27:11,696] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 28142.90 | forward: 145718.96 | backward_microstep: 390271.57 | backward: 390261.19 | backward_inner_microstep: 390243.93 | backward_inner: 390235.62 | backward_allreduce_microstep: 8.28 | backward_allreduce: 2.86 | reduce_tied_grads: 0.31 | comms: 17.99 | reduce_grads: 0.20 | step: 359.11 | _step_clipping: 0.15 | _step_step: 357.31 | _step_zero_grad: 0.52 | _step_check_overflow: 0.56 samples/sec: 16.050 | iteration 27310/ 143000 | elapsed time per iteration (ms): 63800.6 | learning rate: 5.529E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.318129E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 22:37:44,952] [INFO] [logging.py:60:log_dist] [Rank 0] step=27320, skipped=33, lr=[0.0005528843555523854, 0.0005528843555523854], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27320 loss: 2.3185 iter time (s): 63.325 samples/sec: 16.171 %comms: 0.0028596261075290725 %optimizer_step 0.05758543695392264 %forward: 22.962907129492194 %backward: 61.605404179421505 [2025-04-13 22:37:44,953] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23847.59 | forward: 145412.80 | backward_microstep: 390126.02 | backward: 390116.74 | backward_inner_microstep: 390097.23 | backward_inner: 390090.69 | backward_allreduce_microstep: 8.75 | backward_allreduce: 3.01 | reduce_tied_grads: 0.33 | comms: 18.11 | reduce_grads: 0.22 | step: 364.66 | _step_clipping: 0.12 | _step_step: 362.78 | _step_zero_grad: 0.51 | _step_check_overflow: 0.62 samples/sec: 16.170 | iteration 27320/ 143000 | elapsed time per iteration (ms): 63325.7 | learning rate: 5.529E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.310715E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 22:48:27,520] [INFO] [logging.py:60:log_dist] [Rank 0] step=27330, skipped=33, lr=[0.0005528488915056794, 0.0005528488915056794], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27330 loss: 2.3224 iter time (s): 64.256 samples/sec: 15.936 %comms: 0.0028180410357073506 %optimizer_step 0.05696443487060578 %forward: 22.67028283541999 %backward: 60.74210594441396 [2025-04-13 22:48:27,521] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32697.17 | forward: 145670.56 | backward_microstep: 390317.38 | backward: 390305.51 | backward_inner_microstep: 390283.39 | backward_inner: 390276.46 | backward_allreduce_microstep: 12.44 | backward_allreduce: 3.13 | reduce_tied_grads: 0.35 | comms: 18.11 | reduce_grads: 0.24 | step: 366.03 | _step_clipping: 0.15 | _step_step: 364.03 | _step_zero_grad: 0.57 | _step_check_overflow: 0.63 samples/sec: 15.936 | iteration 27330/ 143000 | elapsed time per iteration (ms): 64256.8 | learning rate: 5.528E-04 | approx flops per GPU: 68.7TFLOPS | lm_loss: 2.327162E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 22:59:00,837] [INFO] [logging.py:60:log_dist] [Rank 0] step=27340, skipped=33, lr=[0.000552813415255352, 0.000552813415255352], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27340 loss: 2.3118 iter time (s): 63.331 samples/sec: 16.169 %comms: 0.002844408477411991 %optimizer_step 0.05589091743634078 %forward: 22.962587234708487 %backward: 61.59489823257347 [2025-04-13 22:59:00,838] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23997.77 | forward: 145424.61 | backward_microstep: 390096.27 | backward: 390087.32 | backward_inner_microstep: 390069.90 | backward_inner: 390063.47 | backward_allreduce_microstep: 8.52 | backward_allreduce: 2.87 | reduce_tied_grads: 0.31 | comms: 18.01 | reduce_grads: 0.21 | step: 353.96 | _step_clipping: 0.13 | _step_step: 352.29 | _step_zero_grad: 0.47 | _step_check_overflow: 0.50 samples/sec: 16.169 | iteration 27340/ 143000 | elapsed time per iteration (ms): 63331.7 | learning rate: 5.528E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.321638E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 23:09:40,776] [INFO] [logging.py:60:log_dist] [Rank 0] step=27350, skipped=33, lr=[0.0005527779268031155, 0.0005527779268031155], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27350 loss: 2.3315 iter time (s): 63.993 samples/sec: 16.002 %comms: 0.00283665976218559 %optimizer_step 0.058622525843414874 %forward: 22.731661265680835 %backward: 60.97755939094962 [2025-04-13 23:09:40,777] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30385.00 | forward: 145467.34 | backward_microstep: 390226.24 | backward: 390215.35 | backward_inner_microstep: 390197.49 | backward_inner: 390190.70 | backward_allreduce_microstep: 8.49 | backward_allreduce: 2.93 | reduce_tied_grads: 0.36 | comms: 18.15 | reduce_grads: 0.23 | step: 375.14 | _step_clipping: 0.14 | _step_step: 373.00 | _step_zero_grad: 0.62 | _step_check_overflow: 0.73 samples/sec: 16.002 | iteration 27350/ 143000 | elapsed time per iteration (ms): 63993.9 | learning rate: 5.528E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.320312E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 23:20:30,301] [INFO] [logging.py:60:log_dist] [Rank 0] step=27360, skipped=33, lr=[0.0005527424261506829, 0.0005527424261506829], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27360 loss: 2.3141 iter time (s): 64.952 samples/sec: 15.766 %comms: 0.002782352375981826 %optimizer_step 0.05705663225809056 %forward: 22.422929014461133 %backward: 60.091057522290036 [2025-04-13 23:20:30,302] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 39692.59 | forward: 145641.04 | backward_microstep: 390314.56 | backward: 390302.46 | backward_inner_microstep: 390282.08 | backward_inner: 390274.46 | backward_allreduce_microstep: 9.86 | backward_allreduce: 3.37 | reduce_tied_grads: 0.35 | comms: 18.07 | reduce_grads: 0.22 | step: 370.59 | _step_clipping: 0.13 | _step_step: 368.68 | _step_zero_grad: 0.57 | _step_check_overflow: 0.58 samples/sec: 15.765 | iteration 27360/ 143000 | elapsed time per iteration (ms): 64952.5 | learning rate: 5.527E-04 | approx flops per GPU: 68.0TFLOPS | lm_loss: 2.314895E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 23:31:16,316] [INFO] [logging.py:60:log_dist] [Rank 0] step=27370, skipped=33, lr=[0.0005527069132997674, 0.0005527069132997674], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27370 loss: 2.3039 iter time (s): 64.601 samples/sec: 15.851 %comms: 0.002808319159508034 %optimizer_step 0.055286792813535296 %forward: 22.52098310212812 %backward: 60.39593132677686 [2025-04-13 23:31:16,316] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 36496.05 | forward: 145487.50 | backward_microstep: 390173.26 | backward: 390162.95 | backward_inner_microstep: 390145.70 | backward_inner: 390139.16 | backward_allreduce_microstep: 8.20 | backward_allreduce: 2.81 | reduce_tied_grads: 0.34 | comms: 18.14 | reduce_grads: 0.22 | step: 357.16 | _step_clipping: 0.12 | _step_step: 355.33 | _step_zero_grad: 0.57 | _step_check_overflow: 0.50 samples/sec: 15.851 | iteration 27370/ 143000 | elapsed time per iteration (ms): 64601.5 | learning rate: 5.527E-04 | approx flops per GPU: 68.4TFLOPS | lm_loss: 2.310557E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 23:41:57,818] [INFO] [logging.py:60:log_dist] [Rank 0] step=27380, skipped=33, lr=[0.0005526713882520832, 0.0005526713882520832], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27380 loss: 2.3080 iter time (s): 64.150 samples/sec: 15.963 %comms: 0.002805028384872487 %optimizer_step 0.05692844001302 %forward: 22.657891454019556 %backward: 60.818894671200496 [2025-04-13 23:41:57,819] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32108.09 | forward: 145349.63 | backward_microstep: 390160.79 | backward: 390151.22 | backward_inner_microstep: 390133.24 | backward_inner: 390126.65 | backward_allreduce_microstep: 8.74 | backward_allreduce: 3.00 | reduce_tied_grads: 0.32 | comms: 17.99 | reduce_grads: 0.23 | step: 365.19 | _step_clipping: 0.13 | _step_step: 363.38 | _step_zero_grad: 0.52 | _step_check_overflow: 0.55 samples/sec: 15.963 | iteration 27380/ 143000 | elapsed time per iteration (ms): 64150.2 | learning rate: 5.527E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.306437E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-13 23:52:31,494] [INFO] [logging.py:60:log_dist] [Rank 0] step=27390, skipped=33, lr=[0.0005526358510093447, 0.0005526358510093447], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27390 loss: 2.3083 iter time (s): 63.367 samples/sec: 16.160 %comms: 0.002837262270285291 %optimizer_step 0.05634152934846912 %forward: 22.916292318475616 %backward: 61.539506890702796 [2025-04-13 23:52:31,495] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24659.12 | forward: 145213.89 | backward_microstep: 389967.41 | backward: 389957.97 | backward_inner_microstep: 389940.96 | backward_inner: 389934.58 | backward_allreduce_microstep: 8.27 | backward_allreduce: 2.76 | reduce_tied_grads: 0.32 | comms: 17.98 | reduce_grads: 0.20 | step: 357.02 | _step_clipping: 0.15 | _step_step: 355.18 | _step_zero_grad: 0.50 | _step_check_overflow: 0.59 samples/sec: 16.160 | iteration 27390/ 143000 | elapsed time per iteration (ms): 63367.6 | learning rate: 5.526E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.308297E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 00:03:13,328] [INFO] [logging.py:60:log_dist] [Rank 0] step=27400, skipped=33, lr=[0.0005526003015732671, 0.0005526003015732671], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27400 loss: 2.3113 iter time (s): 64.183 samples/sec: 15.954 %comms: 0.0028189265432404503 %optimizer_step 0.05712053937364104 %forward: 22.647068665371197 %backward: 60.788086059675074 [2025-04-14 00:03:13,329] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32480.07 | forward: 145355.01 | backward_microstep: 390165.24 | backward: 390154.36 | backward_inner_microstep: 390135.34 | backward_inner: 390126.87 | backward_allreduce_microstep: 9.23 | backward_allreduce: 3.20 | reduce_tied_grads: 0.35 | comms: 18.09 | reduce_grads: 0.25 | step: 366.62 | _step_clipping: 0.14 | _step_step: 364.64 | _step_zero_grad: 0.62 | _step_check_overflow: 0.58 samples/sec: 15.954 | iteration 27400/ 143000 | elapsed time per iteration (ms): 64183.4 | learning rate: 5.526E-04 | approx flops per GPU: 68.8TFLOPS | lm_loss: 2.330530E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 00:13:59,446] [INFO] [logging.py:60:log_dist] [Rank 0] step=27410, skipped=33, lr=[0.0005525647399455665, 0.0005525647399455665], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27410 loss: 2.3155 iter time (s): 64.611 samples/sec: 15.849 %comms: 0.002787906226774056 %optimizer_step 0.0561421110903812 %forward: 22.53777995318662 %backward: 60.37850301582527 [2025-04-14 00:13:59,447] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 36601.93 | forward: 145619.33 | backward_microstep: 390121.41 | backward: 390112.84 | backward_inner_microstep: 390095.14 | backward_inner: 390088.65 | backward_allreduce_microstep: 8.67 | backward_allreduce: 2.95 | reduce_tied_grads: 0.31 | comms: 18.01 | reduce_grads: 0.22 | step: 362.74 | _step_clipping: 0.13 | _step_step: 360.98 | _step_zero_grad: 0.55 | _step_check_overflow: 0.47 samples/sec: 15.848 | iteration 27410/ 143000 | elapsed time per iteration (ms): 64611.8 | learning rate: 5.526E-04 | approx flops per GPU: 68.4TFLOPS | lm_loss: 2.321065E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 00:24:39,244] [INFO] [logging.py:60:log_dist] [Rank 0] step=27420, skipped=33, lr=[0.0005525291661279588, 0.0005525291661279588], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27420 loss: 2.2932 iter time (s): 63.979 samples/sec: 16.005 %comms: 0.002802625355649936 %optimizer_step 0.05785371039676295 %forward: 22.721638523358187 %backward: 60.97668841122423 [2025-04-14 00:24:39,245] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30471.46 | forward: 145371.32 | backward_microstep: 390133.28 | backward: 390124.23 | backward_inner_microstep: 390106.38 | backward_inner: 390099.81 | backward_allreduce_microstep: 8.83 | backward_allreduce: 2.97 | reduce_tied_grads: 0.29 | comms: 17.93 | reduce_grads: 0.21 | step: 370.14 | _step_clipping: 0.12 | _step_step: 366.59 | _step_zero_grad: 0.54 | _step_check_overflow: 0.60 samples/sec: 16.005 | iteration 27420/ 143000 | elapsed time per iteration (ms): 63979.8 | learning rate: 5.525E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.302559E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 00:35:23,772] [INFO] [logging.py:60:log_dist] [Rank 0] step=27430, skipped=33, lr=[0.0005524935801221612, 0.0005524935801221612], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27430 loss: 2.3089 iter time (s): 64.452 samples/sec: 15.888 %comms: 0.0027843903240928835 %optimizer_step 0.055435598151562265 %forward: 22.60148475909647 %backward: 60.52781606710591 [2025-04-14 00:35:23,773] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 34888.12 | forward: 145671.51 | backward_microstep: 390124.46 | backward: 390115.02 | backward_inner_microstep: 390096.92 | backward_inner: 390090.25 | backward_allreduce_microstep: 8.79 | backward_allreduce: 3.02 | reduce_tied_grads: 0.29 | comms: 17.95 | reduce_grads: 0.20 | step: 357.29 | _step_clipping: 0.12 | _step_step: 355.52 | _step_zero_grad: 0.53 | _step_check_overflow: 0.54 samples/sec: 15.888 | iteration 27430/ 143000 | elapsed time per iteration (ms): 64452.8 | learning rate: 5.525E-04 | approx flops per GPU: 68.5TFLOPS | lm_loss: 2.306944E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 00:46:02,929] [INFO] [logging.py:60:log_dist] [Rank 0] step=27440, skipped=33, lr=[0.0005524579819298911, 0.0005524579819298911], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27440 loss: 2.3188 iter time (s): 63.915 samples/sec: 16.021 %comms: 0.0028302834502539924 %optimizer_step 0.059442480377992386 %forward: 22.80596033582342 %backward: 61.03726318610675 [2025-04-14 00:46:02,929] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29355.87 | forward: 145764.41 | backward_microstep: 390130.15 | backward: 390119.98 | backward_inner_microstep: 390100.63 | backward_inner: 390093.60 | backward_allreduce_microstep: 9.49 | backward_allreduce: 3.29 | reduce_tied_grads: 0.34 | comms: 18.09 | reduce_grads: 0.24 | step: 379.93 | _step_clipping: 0.14 | _step_step: 377.97 | _step_zero_grad: 0.56 | _step_check_overflow: 0.61 samples/sec: 16.021 | iteration 27440/ 143000 | elapsed time per iteration (ms): 63915.7 | learning rate: 5.525E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.307846E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 00:56:46,557] [INFO] [logging.py:60:log_dist] [Rank 0] step=27450, skipped=33, lr=[0.0005524223715528669, 0.0005524223715528669], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27450 loss: 2.3141 iter time (s): 64.362 samples/sec: 15.910 %comms: 0.002818359258435407 %optimizer_step 0.05629632149007577 %forward: 22.58732007697178 %backward: 60.60249794637578 [2025-04-14 00:56:46,558] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 34316.01 | forward: 145377.13 | backward_microstep: 390059.90 | backward: 390051.45 | backward_inner_microstep: 390034.82 | backward_inner: 390028.49 | backward_allreduce_microstep: 7.95 | backward_allreduce: 2.74 | reduce_tied_grads: 0.34 | comms: 18.14 | reduce_grads: 0.21 | step: 362.34 | _step_clipping: 0.12 | _step_step: 360.48 | _step_zero_grad: 0.54 | _step_check_overflow: 0.59 samples/sec: 15.910 | iteration 27450/ 143000 | elapsed time per iteration (ms): 64362.9 | learning rate: 5.524E-04 | approx flops per GPU: 68.6TFLOPS | lm_loss: 2.313692E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 01:07:23,867] [INFO] [logging.py:60:log_dist] [Rank 0] step=27460, skipped=33, lr=[0.0005523867489928071, 0.0005523867489928071], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27460 loss: 2.3007 iter time (s): 63.730 samples/sec: 16.068 %comms: 0.002852065669437356 %optimizer_step 0.05563482754974286 %forward: 22.851863966887894 %backward: 61.21209731518916 [2025-04-14 01:07:23,867] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27723.26 | forward: 145635.75 | backward_microstep: 390116.48 | backward: 390106.90 | backward_inner_microstep: 390088.25 | backward_inner: 390081.91 | backward_allreduce_microstep: 8.14 | backward_allreduce: 2.80 | reduce_tied_grads: 0.29 | comms: 18.18 | reduce_grads: 0.19 | step: 354.56 | _step_clipping: 0.11 | _step_step: 352.87 | _step_zero_grad: 0.52 | _step_check_overflow: 0.48 samples/sec: 16.068 | iteration 27460/ 143000 | elapsed time per iteration (ms): 63730.9 | learning rate: 5.524E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.311630E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 01:18:05,416] [INFO] [logging.py:60:log_dist] [Rank 0] step=27470, skipped=33, lr=[0.0005523511142514311, 0.0005523511142514311], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27470 loss: 2.3090 iter time (s): 64.154 samples/sec: 15.962 %comms: 0.0028233704675524906 %optimizer_step 0.056198607179304874 %forward: 22.70067564379107 %backward: 60.81608545969732 [2025-04-14 01:18:05,417] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31892.43 | forward: 145634.60 | backward_microstep: 390173.79 | backward: 390161.35 | backward_inner_microstep: 390143.29 | backward_inner: 390136.58 | backward_allreduce_microstep: 8.70 | backward_allreduce: 2.98 | reduce_tied_grads: 0.34 | comms: 18.11 | reduce_grads: 0.22 | step: 360.54 | _step_clipping: 0.13 | _step_step: 358.69 | _step_zero_grad: 0.55 | _step_check_overflow: 0.55 samples/sec: 15.961 | iteration 27470/ 143000 | elapsed time per iteration (ms): 64154.9 | learning rate: 5.524E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.306352E+00 | loss scale: 262144.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 01:23:21,008] [INFO] [stage1.py:697:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 262144.0, reducing to 131072.0 [2025-04-14 01:28:38,328] [INFO] [logging.py:60:log_dist] [Rank 0] step=27480, skipped=34, lr=[0.0005523190325705888, 0.0005523190325705888], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27480 loss: 2.3276 iter time (s): 63.291 samples/sec: 16.179 %comms: 0.0025995989167209997 %optimizer_step 0.05117633811727667 %forward: 22.969617770328245 %backward: 61.652667619249954 [2025-04-14 01:28:38,329] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23434.60 | forward: 145376.18 | backward_microstep: 390213.84 | backward: 390203.67 | backward_inner_microstep: 390185.49 | backward_inner: 390178.70 | backward_allreduce_microstep: 8.88 | backward_allreduce: 3.07 | reduce_tied_grads: 0.30 | comms: 16.45 | reduce_grads: 0.21 | step: 323.90 | _step_clipping: 0.12 | _step_step: 322.16 | _step_zero_grad: 0.55 | _step_check_overflow: 0.45 samples/sec: 16.179 | iteration 27480/ 143000 | elapsed time per iteration (ms): 63291.2 | learning rate: 5.523E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.315846E+00 | loss scale: 131072.0 | number of skipped iterations: 1 | number of nan iterations: 0 | time (ms) [2025-04-14 01:39:05,671] [INFO] [logging.py:60:log_dist] [Rank 0] step=27490, skipped=34, lr=[0.0005522833746894507, 0.0005522833746894507], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27490 loss: 2.3132 iter time (s): 62.734 samples/sec: 16.323 %comms: 0.002916379739079736 %optimizer_step 0.05680956624362658 %forward: 23.17918481640242 %backward: 62.21344444591287 [2025-04-14 01:39:05,671] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17630.22 | forward: 145411.58 | backward_microstep: 390299.01 | backward: 390287.90 | backward_inner_microstep: 390270.46 | backward_inner: 390263.90 | backward_allreduce_microstep: 8.27 | backward_allreduce: 2.85 | reduce_tied_grads: 0.34 | comms: 18.30 | reduce_grads: 0.24 | step: 356.39 | _step_clipping: 0.12 | _step_step: 354.42 | _step_zero_grad: 0.50 | _step_check_overflow: 0.51 samples/sec: 16.323 | iteration 27490/ 143000 | elapsed time per iteration (ms): 62734.3 | learning rate: 5.523E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.311224E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 01:49:45,346] [INFO] [logging.py:60:log_dist] [Rank 0] step=27500, skipped=34, lr=[0.0005522477046319856, 0.0005522477046319856], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27500 loss: 2.3013 iter time (s): 63.967 samples/sec: 16.008 %comms: 0.002856800660207762 %optimizer_step 0.057561326032232414 %forward: 22.740070265272237 %backward: 61.013290179426235 [2025-04-14 01:49:45,347] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29904.15 | forward: 145461.18 | backward_microstep: 390294.12 | backward: 390283.12 | backward_inner_microstep: 390263.81 | backward_inner: 390256.63 | backward_allreduce_microstep: 9.49 | backward_allreduce: 3.25 | reduce_tied_grads: 0.35 | comms: 18.27 | reduce_grads: 0.27 | step: 368.20 | _step_clipping: 0.12 | _step_step: 366.17 | _step_zero_grad: 0.63 | _step_check_overflow: 0.60 samples/sec: 16.008 | iteration 27500/ 143000 | elapsed time per iteration (ms): 63967.5 | learning rate: 5.522E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.306355E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 02:00:17,313] [INFO] [logging.py:60:log_dist] [Rank 0] step=27510, skipped=34, lr=[0.0005522120223999152, 0.0005522120223999152], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27510 loss: 2.3319 iter time (s): 63.196 samples/sec: 16.204 %comms: 0.002871764938452171 %optimizer_step 0.05684321417965734 %forward: 22.990568284520556 %backward: 61.73922642622725 [2025-04-14 02:00:17,313] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22494.82 | forward: 145291.33 | backward_microstep: 390176.77 | backward: 390167.57 | backward_inner_microstep: 390149.84 | backward_inner: 390143.27 | backward_allreduce_microstep: 8.62 | backward_allreduce: 2.97 | reduce_tied_grads: 0.29 | comms: 18.15 | reduce_grads: 0.21 | step: 359.23 | _step_clipping: 0.13 | _step_step: 357.45 | _step_zero_grad: 0.51 | _step_check_overflow: 0.56 samples/sec: 16.203 | iteration 27510/ 143000 | elapsed time per iteration (ms): 63196.7 | learning rate: 5.522E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.321421E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 02:10:53,566] [INFO] [logging.py:60:log_dist] [Rank 0] step=27520, skipped=34, lr=[0.0005521763279949617, 0.0005521763279949617], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27520 loss: 2.3089 iter time (s): 63.625 samples/sec: 16.094 %comms: 0.002829069592288784 %optimizer_step 0.0573984446212537 %forward: 22.873325261118715 %backward: 61.32776248734182 [2025-04-14 02:10:53,567] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26531.11 | forward: 145530.98 | backward_microstep: 390207.05 | backward: 390196.42 | backward_inner_microstep: 390178.67 | backward_inner: 390172.06 | backward_allreduce_microstep: 8.60 | backward_allreduce: 2.97 | reduce_tied_grads: 0.34 | comms: 18.00 | reduce_grads: 0.21 | step: 365.20 | _step_clipping: 0.13 | _step_step: 363.35 | _step_zero_grad: 0.53 | _step_check_overflow: 0.57 samples/sec: 16.094 | iteration 27520/ 143000 | elapsed time per iteration (ms): 63625.3 | learning rate: 5.522E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.321305E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 02:21:30,625] [INFO] [logging.py:60:log_dist] [Rank 0] step=27530, skipped=34, lr=[0.0005521406214188478, 0.0005521406214188478], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27530 loss: 2.3292 iter time (s): 63.705 samples/sec: 16.074 %comms: 0.0028173334969030226 %optimizer_step 0.05513487858461231 %forward: 22.83301392507023 %backward: 61.25818626689244 [2025-04-14 02:21:30,625] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27332.98 | forward: 145458.44 | backward_microstep: 390257.08 | backward: 390247.21 | backward_inner_microstep: 390229.89 | backward_inner: 390223.45 | backward_allreduce_microstep: 8.41 | backward_allreduce: 2.91 | reduce_tied_grads: 0.30 | comms: 17.95 | reduce_grads: 0.20 | step: 351.24 | _step_clipping: 0.13 | _step_step: 349.45 | _step_zero_grad: 0.51 | _step_check_overflow: 0.53 samples/sec: 16.074 | iteration 27530/ 143000 | elapsed time per iteration (ms): 63705.9 | learning rate: 5.521E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.315588E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 02:31:57,836] [INFO] [logging.py:60:log_dist] [Rank 0] step=27540, skipped=34, lr=[0.0005521049026732969, 0.0005521049026732969], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27540 loss: 2.3153 iter time (s): 62.721 samples/sec: 16.326 %comms: 0.002891522077112604 %optimizer_step 0.05898435906460514 %forward: 23.179518291800893 %backward: 62.22161145822736 [2025-04-14 02:31:57,837] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17553.01 | forward: 145383.22 | backward_microstep: 390266.75 | backward: 390257.38 | backward_inner_microstep: 390239.93 | backward_inner: 390233.48 | backward_allreduce_microstep: 8.36 | backward_allreduce: 2.88 | reduce_tied_grads: 0.38 | comms: 18.14 | reduce_grads: 0.26 | step: 369.95 | _step_clipping: 0.15 | _step_step: 368.08 | _step_zero_grad: 0.53 | _step_check_overflow: 0.55 samples/sec: 16.326 | iteration 27540/ 143000 | elapsed time per iteration (ms): 62721.2 | learning rate: 5.521E-04 | approx flops per GPU: 70.4TFLOPS | lm_loss: 2.314869E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 02:42:28,236] [INFO] [logging.py:60:log_dist] [Rank 0] step=27550, skipped=34, lr=[0.0005520691717600328, 0.0005520691717600328], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27550 loss: 2.2911 iter time (s): 63.039 samples/sec: 16.244 %comms: 0.002904165928220038 %optimizer_step 0.05830025735385054 %forward: 23.071489576007963 %backward: 61.926714136020465 [2025-04-14 02:42:28,237] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20547.30 | forward: 145441.27 | backward_microstep: 390392.81 | backward: 390382.25 | backward_inner_microstep: 390364.57 | backward_inner: 390357.92 | backward_allreduce_microstep: 8.50 | backward_allreduce: 2.93 | reduce_tied_grads: 0.36 | comms: 18.31 | reduce_grads: 0.25 | step: 367.52 | _step_clipping: 0.13 | _step_step: 365.51 | _step_zero_grad: 0.57 | _step_check_overflow: 0.62 samples/sec: 16.244 | iteration 27550/ 143000 | elapsed time per iteration (ms): 63040.0 | learning rate: 5.521E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.309403E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 02:53:05,786] [INFO] [logging.py:60:log_dist] [Rank 0] step=27560, skipped=34, lr=[0.0005520334286807804, 0.0005520334286807804], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27560 loss: 2.3137 iter time (s): 63.754 samples/sec: 16.062 %comms: 0.003031803494123236 %optimizer_step 0.05734858698621349 %forward: 22.811244688858295 %backward: 61.209005426440974 [2025-04-14 02:53:05,786] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27889.03 | forward: 145431.64 | backward_microstep: 390243.58 | backward: 390234.14 | backward_inner_microstep: 390215.23 | backward_inner: 390208.48 | backward_allreduce_microstep: 9.37 | backward_allreduce: 3.36 | reduce_tied_grads: 0.32 | comms: 19.33 | reduce_grads: 0.21 | step: 365.62 | _step_clipping: 0.11 | _step_step: 363.84 | _step_zero_grad: 0.53 | _step_check_overflow: 0.56 samples/sec: 16.061 | iteration 27560/ 143000 | elapsed time per iteration (ms): 63755.0 | learning rate: 5.520E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.311392E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 03:03:35,098] [INFO] [logging.py:60:log_dist] [Rank 0] step=27570, skipped=34, lr=[0.0005519976734372643, 0.0005519976734372643], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27570 loss: 2.3152 iter time (s): 62.931 samples/sec: 16.272 %comms: 0.002843072392551499 %optimizer_step 0.055642127702823244 %forward: 23.092872045699547 %backward: 61.98554674103853 [2025-04-14 03:03:35,099] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19941.55 | forward: 145324.99 | backward_microstep: 390087.62 | backward: 390079.21 | backward_inner_microstep: 390062.81 | backward_inner: 390056.59 | backward_allreduce_microstep: 7.90 | backward_allreduce: 2.70 | reduce_tied_grads: 0.28 | comms: 17.89 | reduce_grads: 0.20 | step: 350.16 | _step_clipping: 0.13 | _step_step: 348.51 | _step_zero_grad: 0.47 | _step_check_overflow: 0.49 samples/sec: 16.272 | iteration 27570/ 143000 | elapsed time per iteration (ms): 62931.2 | learning rate: 5.520E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.315656E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 03:14:17,640] [INFO] [logging.py:60:log_dist] [Rank 0] step=27580, skipped=34, lr=[0.0005519619060312107, 0.0005519619060312107], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27580 loss: 2.3296 iter time (s): 64.254 samples/sec: 15.937 %comms: 0.0028056463403285056 %optimizer_step 0.05655348437678013 %forward: 22.65990392948488 %backward: 60.72745173229387 [2025-04-14 03:14:17,641] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32765.12 | forward: 145598.18 | backward_microstep: 390207.14 | backward: 390196.13 | backward_inner_microstep: 390177.99 | backward_inner: 390169.51 | backward_allreduce_microstep: 8.75 | backward_allreduce: 3.04 | reduce_tied_grads: 0.35 | comms: 18.03 | reduce_grads: 0.24 | step: 363.38 | _step_clipping: 0.16 | _step_step: 361.44 | _step_zero_grad: 0.56 | _step_check_overflow: 0.57 samples/sec: 15.937 | iteration 27580/ 143000 | elapsed time per iteration (ms): 64254.2 | learning rate: 5.520E-04 | approx flops per GPU: 68.7TFLOPS | lm_loss: 2.319370E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 03:25:08,359] [INFO] [logging.py:60:log_dist] [Rank 0] step=27590, skipped=34, lr=[0.0005519261264643455, 0.0005519261264643455], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27590 loss: 2.3368 iter time (s): 65.071 samples/sec: 15.737 %comms: 0.002775304770240269 %optimizer_step 0.05443750084371888 %forward: 22.34427830738942 %backward: 59.96046055031079 [2025-04-14 03:25:08,359] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 41192.18 | forward: 145397.00 | backward_microstep: 390180.06 | backward: 390170.18 | backward_inner_microstep: 390152.25 | backward_inner: 390145.62 | backward_allreduce_microstep: 8.70 | backward_allreduce: 2.98 | reduce_tied_grads: 0.32 | comms: 18.06 | reduce_grads: 0.20 | step: 354.23 | _step_clipping: 0.13 | _step_step: 352.56 | _step_zero_grad: 0.50 | _step_check_overflow: 0.47 samples/sec: 15.736 | iteration 27590/ 143000 | elapsed time per iteration (ms): 65071.8 | learning rate: 5.519E-04 | approx flops per GPU: 67.9TFLOPS | lm_loss: 2.320261E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 03:35:38,751] [INFO] [logging.py:60:log_dist] [Rank 0] step=27600, skipped=34, lr=[0.0005518903347383958, 0.0005518903347383958], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27600 loss: 2.3344 iter time (s): 63.039 samples/sec: 16.244 %comms: 0.002863086961027288 %optimizer_step 0.05605177002519584 %forward: 23.076993569434677 %backward: 61.888041083643216 [2025-04-14 03:35:38,752] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20821.59 | forward: 145474.34 | backward_microstep: 390142.72 | backward: 390134.10 | backward_inner_microstep: 390117.04 | backward_inner: 390110.60 | backward_allreduce_microstep: 8.26 | backward_allreduce: 2.82 | reduce_tied_grads: 0.29 | comms: 18.05 | reduce_grads: 0.21 | step: 353.34 | _step_clipping: 0.12 | _step_step: 351.63 | _step_zero_grad: 0.50 | _step_check_overflow: 0.50 samples/sec: 16.244 | iteration 27600/ 143000 | elapsed time per iteration (ms): 63039.2 | learning rate: 5.519E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.318931E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 03:46:10,063] [INFO] [logging.py:60:log_dist] [Rank 0] step=27610, skipped=34, lr=[0.0005518545308550892, 0.0005518545308550892], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27610 loss: 2.3035 iter time (s): 63.131 samples/sec: 16.220 %comms: 0.0028468707571803744 %optimizer_step 0.05554827122249415 %forward: 23.023450946609756 %backward: 61.8006422945581 [2025-04-14 03:46:10,063] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21857.35 | forward: 145348.45 | backward_microstep: 390160.55 | backward: 390151.24 | backward_inner_microstep: 390134.07 | backward_inner: 390127.72 | backward_allreduce_microstep: 8.23 | backward_allreduce: 2.81 | reduce_tied_grads: 0.28 | comms: 17.97 | reduce_grads: 0.20 | step: 350.68 | _step_clipping: 0.11 | _step_step: 348.92 | _step_zero_grad: 0.51 | _step_check_overflow: 0.54 samples/sec: 16.220 | iteration 27610/ 143000 | elapsed time per iteration (ms): 63131.2 | learning rate: 5.519E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.316440E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 03:56:46,204] [INFO] [logging.py:60:log_dist] [Rank 0] step=27620, skipped=34, lr=[0.0005518187148161534, 0.0005518187148161534], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27620 loss: 2.3055 iter time (s): 63.614 samples/sec: 16.097 %comms: 0.002834554441059196 %optimizer_step 0.05651561104922784 %forward: 22.870155980096403 %backward: 61.33189076362676 [2025-04-14 03:56:46,204] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26532.18 | forward: 145485.10 | backward_microstep: 390163.77 | backward: 390153.72 | backward_inner_microstep: 390136.23 | backward_inner: 390129.70 | backward_allreduce_microstep: 8.42 | backward_allreduce: 2.91 | reduce_tied_grads: 0.32 | comms: 18.03 | reduce_grads: 0.21 | step: 359.52 | _step_clipping: 0.14 | _step_step: 357.72 | _step_zero_grad: 0.51 | _step_check_overflow: 0.53 samples/sec: 16.097 | iteration 27620/ 143000 | elapsed time per iteration (ms): 63614.1 | learning rate: 5.518E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.309481E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 04:07:35,880] [INFO] [logging.py:60:log_dist] [Rank 0] step=27630, skipped=34, lr=[0.0005517828866233174, 0.0005517828866233174], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27630 loss: 2.3033 iter time (s): 64.967 samples/sec: 15.762 %comms: 0.0028177411912594977 %optimizer_step 0.05678041537251334 %forward: 22.412758146700867 %backward: 60.06744924718318 [2025-04-14 04:07:35,881] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 39839.49 | forward: 145608.95 | backward_microstep: 390249.46 | backward: 390240.15 | backward_inner_microstep: 390222.11 | backward_inner: 390215.51 | backward_allreduce_microstep: 8.81 | backward_allreduce: 3.05 | reduce_tied_grads: 0.39 | comms: 18.31 | reduce_grads: 0.24 | step: 368.89 | _step_clipping: 0.17 | _step_step: 366.97 | _step_zero_grad: 0.56 | _step_check_overflow: 0.52 samples/sec: 15.762 | iteration 27630/ 143000 | elapsed time per iteration (ms): 64967.6 | learning rate: 5.518E-04 | approx flops per GPU: 68.0TFLOPS | lm_loss: 2.321435E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 04:18:21,482] [INFO] [logging.py:60:log_dist] [Rank 0] step=27640, skipped=34, lr=[0.0005517470462783101, 0.0005517470462783101], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27640 loss: 2.3111 iter time (s): 64.560 samples/sec: 15.861 %comms: 0.0027971498029538103 %optimizer_step 0.055659706257665385 %forward: 22.532816221721035 %backward: 60.44227193455867 [2025-04-14 04:18:21,483] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 35919.18 | forward: 145471.06 | backward_microstep: 390222.47 | backward: 390213.17 | backward_inner_microstep: 390195.01 | backward_inner: 390188.34 | backward_allreduce_microstep: 8.81 | backward_allreduce: 3.02 | reduce_tied_grads: 0.32 | comms: 18.06 | reduce_grads: 0.22 | step: 359.34 | _step_clipping: 0.13 | _step_step: 357.46 | _step_zero_grad: 0.52 | _step_check_overflow: 0.62 samples/sec: 15.861 | iteration 27640/ 143000 | elapsed time per iteration (ms): 64560.2 | learning rate: 5.517E-04 | approx flops per GPU: 68.4TFLOPS | lm_loss: 2.312085E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 04:29:11,068] [INFO] [logging.py:60:log_dist] [Rank 0] step=27650, skipped=34, lr=[0.0005517111937828616, 0.0005517111937828616], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27650 loss: 2.2989 iter time (s): 64.958 samples/sec: 15.764 %comms: 0.002810499999221518 %optimizer_step 0.0573155839497517 %forward: 22.420206492327157 %backward: 60.10026704752684 [2025-04-14 04:29:11,068] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 39504.10 | forward: 145637.02 | backward_microstep: 390411.61 | backward: 390398.89 | backward_inner_microstep: 390376.69 | backward_inner: 390369.36 | backward_allreduce_microstep: 11.66 | backward_allreduce: 3.42 | reduce_tied_grads: 0.40 | comms: 18.26 | reduce_grads: 0.25 | step: 372.31 | _step_clipping: 0.16 | _step_step: 370.24 | _step_zero_grad: 0.61 | _step_check_overflow: 0.62 samples/sec: 15.764 | iteration 27650/ 143000 | elapsed time per iteration (ms): 64958.5 | learning rate: 5.517E-04 | approx flops per GPU: 68.0TFLOPS | lm_loss: 2.311823E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 04:39:50,054] [INFO] [logging.py:60:log_dist] [Rank 0] step=27660, skipped=34, lr=[0.0005516753291387021, 0.0005516753291387021], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27660 loss: 2.3143 iter time (s): 63.898 samples/sec: 16.026 %comms: 0.0028359976889911647 %optimizer_step 0.057047220159632275 %forward: 22.79634948087989 %backward: 61.07807859500277 [2025-04-14 04:39:50,055] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29050.84 | forward: 145664.30 | backward_microstep: 390287.12 | backward: 390277.20 | backward_inner_microstep: 390259.16 | backward_inner: 390252.40 | backward_allreduce_microstep: 8.68 | backward_allreduce: 2.98 | reduce_tied_grads: 0.34 | comms: 18.12 | reduce_grads: 0.23 | step: 364.52 | _step_clipping: 0.14 | _step_step: 362.57 | _step_zero_grad: 0.57 | _step_check_overflow: 0.57 samples/sec: 16.025 | iteration 27660/ 143000 | elapsed time per iteration (ms): 63898.7 | learning rate: 5.517E-04 | approx flops per GPU: 69.1TFLOPS | lm_loss: 2.314947E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 04:50:25,542] [INFO] [logging.py:60:log_dist] [Rank 0] step=27670, skipped=34, lr=[0.0005516394523475627, 0.0005516394523475627], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27670 loss: 2.3198 iter time (s): 63.548 samples/sec: 16.114 %comms: 0.002864032585455893 %optimizer_step 0.05871593204904294 %forward: 22.908566607611235 %backward: 61.428073548820514 [2025-04-14 04:50:25,543] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25519.91 | forward: 145579.70 | backward_microstep: 390376.01 | backward: 390364.05 | backward_inner_microstep: 390344.80 | backward_inner: 390337.87 | backward_allreduce_microstep: 9.45 | backward_allreduce: 3.29 | reduce_tied_grads: 0.34 | comms: 18.20 | reduce_grads: 0.22 | step: 373.13 | _step_clipping: 0.14 | _step_step: 371.21 | _step_zero_grad: 0.51 | _step_check_overflow: 0.65 samples/sec: 16.114 | iteration 27670/ 143000 | elapsed time per iteration (ms): 63548.7 | learning rate: 5.516E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.313338E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 05:01:05,858] [INFO] [logging.py:60:log_dist] [Rank 0] step=27680, skipped=34, lr=[0.0005516035634111749, 0.0005516035634111749], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27680 loss: 2.3196 iter time (s): 64.031 samples/sec: 15.992 %comms: 0.0028165180531833124 %optimizer_step 0.05594471978663999 %forward: 22.718239594791605 %backward: 60.940764340355734 [2025-04-14 05:01:05,859] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30661.96 | forward: 145467.25 | backward_microstep: 390219.95 | backward: 390210.06 | backward_inner_microstep: 390190.77 | backward_inner: 390184.03 | backward_allreduce_microstep: 8.50 | backward_allreduce: 2.93 | reduce_tied_grads: 0.30 | comms: 18.03 | reduce_grads: 0.21 | step: 358.22 | _step_clipping: 0.13 | _step_step: 356.42 | _step_zero_grad: 0.55 | _step_check_overflow: 0.52 samples/sec: 15.992 | iteration 27680/ 143000 | elapsed time per iteration (ms): 64031.6 | learning rate: 5.516E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.309587E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 05:11:42,601] [INFO] [logging.py:60:log_dist] [Rank 0] step=27690, skipped=34, lr=[0.000551567662331271, 0.000551567662331271], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27690 loss: 2.3425 iter time (s): 63.674 samples/sec: 16.082 %comms: 0.0028313117532767222 %optimizer_step 0.06114558750467007 %forward: 22.831335971916435 %backward: 61.26892553300214 [2025-04-14 05:11:42,602] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27244.20 | forward: 145375.66 | backward_microstep: 390130.61 | backward: 390122.16 | backward_inner_microstep: 390104.63 | backward_inner: 390098.14 | backward_allreduce_microstep: 8.38 | backward_allreduce: 2.87 | reduce_tied_grads: 0.33 | comms: 18.03 | reduce_grads: 0.21 | step: 389.34 | _step_clipping: 0.14 | _step_step: 387.52 | _step_zero_grad: 0.51 | _step_check_overflow: 0.56 samples/sec: 16.082 | iteration 27690/ 143000 | elapsed time per iteration (ms): 63674.3 | learning rate: 5.516E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.329826E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 05:22:14,437] [INFO] [logging.py:60:log_dist] [Rank 0] step=27700, skipped=34, lr=[0.0005515317491095836, 0.0005515317491095836], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27700 loss: 2.3086 iter time (s): 63.183 samples/sec: 16.207 %comms: 0.0028424737903439837 %optimizer_step 0.05653227855377815 %forward: 23.021157380167605 %backward: 61.751986025452766 [2025-04-14 05:22:14,438] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22246.09 | forward: 145454.52 | backward_microstep: 390176.54 | backward: 390167.41 | backward_inner_microstep: 390149.86 | backward_inner: 390143.40 | backward_allreduce_microstep: 8.29 | backward_allreduce: 2.85 | reduce_tied_grads: 0.30 | comms: 17.96 | reduce_grads: 0.24 | step: 357.19 | _step_clipping: 0.13 | _step_step: 355.40 | _step_zero_grad: 0.53 | _step_check_overflow: 0.55 samples/sec: 16.207 | iteration 27700/ 143000 | elapsed time per iteration (ms): 63183.5 | learning rate: 5.515E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.319095E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 05:32:50,232] [INFO] [logging.py:60:log_dist] [Rank 0] step=27710, skipped=34, lr=[0.0005514958237478461, 0.0005514958237478461], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27710 loss: 2.3037 iter time (s): 63.579 samples/sec: 16.106 %comms: 0.0028341090593613258 %optimizer_step 0.05589337660297912 %forward: 22.892377647128765 %backward: 61.372952302252024 [2025-04-14 05:32:50,233] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26080.92 | forward: 145547.28 | backward_microstep: 390211.21 | backward: 390202.64 | backward_inner_microstep: 390185.90 | backward_inner: 390179.60 | backward_allreduce_microstep: 8.13 | backward_allreduce: 2.74 | reduce_tied_grads: 0.35 | comms: 18.02 | reduce_grads: 0.20 | step: 355.36 | _step_clipping: 0.12 | _step_step: 353.69 | _step_zero_grad: 0.49 | _step_check_overflow: 0.49 samples/sec: 16.106 | iteration 27710/ 143000 | elapsed time per iteration (ms): 63579.5 | learning rate: 5.515E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.307635E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 05:43:28,987] [INFO] [logging.py:60:log_dist] [Rank 0] step=27720, skipped=34, lr=[0.0005514598862477924, 0.0005514598862477924], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27720 loss: 2.3272 iter time (s): 63.875 samples/sec: 16.031 %comms: 0.002846468851698274 %optimizer_step 0.05652498884173759 %forward: 22.763414198340715 %backward: 61.083213837630325 [2025-04-14 05:43:28,988] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29212.24 | forward: 145401.16 | backward_microstep: 390178.21 | backward: 390168.62 | backward_inner_microstep: 390151.69 | backward_inner: 390145.22 | backward_allreduce_microstep: 8.04 | backward_allreduce: 2.76 | reduce_tied_grads: 0.34 | comms: 18.18 | reduce_grads: 0.20 | step: 361.05 | _step_clipping: 0.11 | _step_step: 359.25 | _step_zero_grad: 0.51 | _step_check_overflow: 0.58 samples/sec: 16.031 | iteration 27720/ 143000 | elapsed time per iteration (ms): 63875.5 | learning rate: 5.515E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.316321E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 05:53:59,721] [INFO] [logging.py:60:log_dist] [Rank 0] step=27730, skipped=34, lr=[0.000551423936611157, 0.000551423936611157], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27730 loss: 2.2968 iter time (s): 63.073 samples/sec: 16.235 %comms: 0.0028412371907052644 %optimizer_step 0.055294428151972996 %forward: 23.028566312197416 %backward: 61.84023644198007 [2025-04-14 05:53:59,722] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21506.62 | forward: 145247.74 | backward_microstep: 390051.55 | backward: 390044.03 | backward_inner_microstep: 390028.36 | backward_inner: 390022.37 | backward_allreduce_microstep: 7.40 | backward_allreduce: 2.54 | reduce_tied_grads: 0.29 | comms: 17.92 | reduce_grads: 0.18 | step: 348.76 | _step_clipping: 0.12 | _step_step: 347.13 | _step_zero_grad: 0.46 | _step_check_overflow: 0.51 samples/sec: 16.235 | iteration 27730/ 143000 | elapsed time per iteration (ms): 63073.4 | learning rate: 5.514E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.314400E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 06:04:33,809] [INFO] [logging.py:60:log_dist] [Rank 0] step=27740, skipped=34, lr=[0.000551387974839675, 0.000551387974839675], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27740 loss: 2.3138 iter time (s): 63.408 samples/sec: 16.149 %comms: 0.0028269255970234733 %optimizer_step 0.05650617542804724 %forward: 22.917347788143427 %backward: 61.514835027632046 [2025-04-14 06:04:33,809] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24757.38 | forward: 145314.76 | backward_microstep: 390062.32 | backward: 390054.45 | backward_inner_microstep: 390035.84 | backward_inner: 390029.24 | backward_allreduce_microstep: 9.23 | backward_allreduce: 3.16 | reduce_tied_grads: 0.28 | comms: 17.93 | reduce_grads: 0.21 | step: 358.30 | _step_clipping: 0.12 | _step_step: 356.66 | _step_zero_grad: 0.50 | _step_check_overflow: 0.45 samples/sec: 16.149 | iteration 27740/ 143000 | elapsed time per iteration (ms): 63408.7 | learning rate: 5.514E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.308720E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 06:15:02,827] [INFO] [logging.py:60:log_dist] [Rank 0] step=27750, skipped=34, lr=[0.000551352000935082, 0.000551352000935082], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27750 loss: 2.3097 iter time (s): 62.901 samples/sec: 16.279 %comms: 0.0028531967348500976 %optimizer_step 0.05847353328049368 %forward: 23.10567372788265 %backward: 62.00788539355617 [2025-04-14 06:15:02,827] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19695.08 | forward: 145337.53 | backward_microstep: 390045.16 | backward: 390037.23 | backward_inner_microstep: 390021.54 | backward_inner: 390015.55 | backward_allreduce_microstep: 7.54 | backward_allreduce: 2.58 | reduce_tied_grads: 0.31 | comms: 17.95 | reduce_grads: 0.19 | step: 367.81 | _step_clipping: 0.12 | _step_step: 366.03 | _step_zero_grad: 0.51 | _step_check_overflow: 0.58 samples/sec: 16.279 | iteration 27750/ 143000 | elapsed time per iteration (ms): 62901.8 | learning rate: 5.514E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.314993E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 06:25:32,320] [INFO] [logging.py:60:log_dist] [Rank 0] step=27760, skipped=34, lr=[0.0005513160148991145, 0.0005513160148991145], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27760 loss: 2.3121 iter time (s): 62.949 samples/sec: 16.267 %comms: 0.0028550203724716844 %optimizer_step 0.056411346881161235 %forward: 23.06935479649123 %backward: 61.9630901163395 [2025-04-14 06:25:32,321] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20260.67 | forward: 145218.67 | backward_microstep: 390057.63 | backward: 390049.82 | backward_inner_microstep: 390031.09 | backward_inner: 390024.92 | backward_allreduce_microstep: 8.33 | backward_allreduce: 2.88 | reduce_tied_grads: 0.30 | comms: 17.97 | reduce_grads: 0.21 | step: 355.10 | _step_clipping: 0.13 | _step_step: 353.26 | _step_zero_grad: 0.52 | _step_check_overflow: 0.62 samples/sec: 16.267 | iteration 27760/ 143000 | elapsed time per iteration (ms): 62949.3 | learning rate: 5.513E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.309492E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 06:35:55,114] [INFO] [logging.py:60:log_dist] [Rank 0] step=27770, skipped=34, lr=[0.0005512800167335091, 0.0005512800167335091], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27770 loss: 2.2895 iter time (s): 62.279 samples/sec: 16.442 %comms: 0.002871949477770384 %optimizer_step 0.05559710598803943 %forward: 23.305933982082372 %backward: 62.61387978812621 [2025-04-14 06:35:55,115] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 13793.21 | forward: 145146.60 | backward_microstep: 389959.10 | backward: 389951.84 | backward_inner_microstep: 389936.37 | backward_inner: 389930.64 | backward_allreduce_microstep: 7.49 | backward_allreduce: 2.59 | reduce_tied_grads: 0.27 | comms: 17.89 | reduce_grads: 0.18 | step: 346.25 | _step_clipping: 0.11 | _step_step: 344.64 | _step_zero_grad: 0.46 | _step_check_overflow: 0.51 samples/sec: 16.442 | iteration 27770/ 143000 | elapsed time per iteration (ms): 62279.4 | learning rate: 5.513E-04 | approx flops per GPU: 70.9TFLOPS | lm_loss: 2.310826E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 06:46:28,963] [INFO] [logging.py:60:log_dist] [Rank 0] step=27780, skipped=34, lr=[0.0005512440064400034, 0.0005512440064400034], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27780 loss: 2.3094 iter time (s): 63.384 samples/sec: 16.155 %comms: 0.002820505097334742 %optimizer_step 0.054728421296267524 %forward: 22.913990179078287 %backward: 61.52589936629498 [2025-04-14 06:46:28,963] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24736.33 | forward: 145238.76 | backward_microstep: 389984.96 | backward: 389977.71 | backward_inner_microstep: 389959.77 | backward_inner: 389953.48 | backward_allreduce_microstep: 8.81 | backward_allreduce: 3.04 | reduce_tied_grads: 0.26 | comms: 17.88 | reduce_grads: 0.19 | step: 346.89 | _step_clipping: 0.12 | _step_step: 345.18 | _step_zero_grad: 0.50 | _step_check_overflow: 0.53 samples/sec: 16.155 | iteration 27780/ 143000 | elapsed time per iteration (ms): 63384.9 | learning rate: 5.512E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.304513E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 06:56:58,641] [INFO] [logging.py:60:log_dist] [Rank 0] step=27790, skipped=34, lr=[0.0005512079840203351, 0.0005512079840203351], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27790 loss: 2.3110 iter time (s): 62.967 samples/sec: 16.262 %comms: 0.0028388842711666936 %optimizer_step 0.05547156996060053 %forward: 23.0834079117248 %backward: 61.931350098737184 [2025-04-14 06:56:58,641] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20452.14 | forward: 145349.85 | backward_microstep: 389971.87 | backward: 389964.63 | backward_inner_microstep: 389949.02 | backward_inner: 389942.94 | backward_allreduce_microstep: 7.54 | backward_allreduce: 2.60 | reduce_tied_grads: 0.27 | comms: 17.88 | reduce_grads: 0.18 | step: 349.29 | _step_clipping: 0.10 | _step_step: 347.68 | _step_zero_grad: 0.47 | _step_check_overflow: 0.50 samples/sec: 16.262 | iteration 27790/ 143000 | elapsed time per iteration (ms): 62967.8 | learning rate: 5.512E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.319907E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 07:07:25,216] [INFO] [logging.py:60:log_dist] [Rank 0] step=27800, skipped=34, lr=[0.0005511719494762433, 0.0005511719494762433], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27800 loss: 2.3032 iter time (s): 62.657 samples/sec: 16.343 %comms: 0.0028936538507847703 %optimizer_step 0.05817212266718683 %forward: 23.180379536800356 %backward: 62.24374462202269 [2025-04-14 07:07:25,217] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 17388.62 | forward: 145241.40 | backward_microstep: 390008.14 | backward: 390000.90 | backward_inner_microstep: 389984.71 | backward_inner: 389978.51 | backward_allreduce_microstep: 7.83 | backward_allreduce: 2.69 | reduce_tied_grads: 0.28 | comms: 18.13 | reduce_grads: 0.20 | step: 364.49 | _step_clipping: 0.12 | _step_step: 362.86 | _step_zero_grad: 0.48 | _step_check_overflow: 0.46 samples/sec: 16.343 | iteration 27800/ 143000 | elapsed time per iteration (ms): 62657.6 | learning rate: 5.512E-04 | approx flops per GPU: 70.5TFLOPS | lm_loss: 2.313110E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 07:17:54,194] [INFO] [logging.py:60:log_dist] [Rank 0] step=27810, skipped=34, lr=[0.0005511359028094667, 0.0005511359028094667], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27810 loss: 2.3189 iter time (s): 62.897 samples/sec: 16.281 %comms: 0.002860014812374926 %optimizer_step 0.05726352361273432 %forward: 23.100357152167827 %backward: 62.01387395270318 [2025-04-14 07:17:54,194] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 19672.34 | forward: 145294.69 | backward_microstep: 390057.22 | backward: 390049.67 | backward_inner_microstep: 390033.89 | backward_inner: 390027.78 | backward_allreduce_microstep: 7.58 | backward_allreduce: 2.61 | reduce_tied_grads: 0.29 | comms: 17.99 | reduce_grads: 0.20 | step: 360.17 | _step_clipping: 0.11 | _step_step: 358.36 | _step_zero_grad: 0.53 | _step_check_overflow: 0.59 samples/sec: 16.280 | iteration 27810/ 143000 | elapsed time per iteration (ms): 62897.7 | learning rate: 5.511E-04 | approx flops per GPU: 70.2TFLOPS | lm_loss: 2.312510E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 07:28:27,361] [INFO] [logging.py:60:log_dist] [Rank 0] step=27820, skipped=34, lr=[0.0005510998440217456, 0.0005510998440217456], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27820 loss: 2.3271 iter time (s): 63.316 samples/sec: 16.173 %comms: 0.0028325038925963958 %optimizer_step 0.056393494917093 %forward: 22.992759147113944 %backward: 61.62726398210926 [2025-04-14 07:28:27,362] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23407.28 | forward: 145581.28 | backward_microstep: 390209.79 | backward: 390200.07 | backward_inner_microstep: 390182.71 | backward_inner: 390176.07 | backward_allreduce_microstep: 8.41 | backward_allreduce: 2.90 | reduce_tied_grads: 0.30 | comms: 17.93 | reduce_grads: 0.20 | step: 357.06 | _step_clipping: 0.12 | _step_step: 355.32 | _step_zero_grad: 0.50 | _step_check_overflow: 0.56 samples/sec: 16.173 | iteration 27820/ 143000 | elapsed time per iteration (ms): 63316.8 | learning rate: 5.511E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.314997E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 07:39:08,307] [INFO] [logging.py:60:log_dist] [Rank 0] step=27830, skipped=34, lr=[0.0005510637731148198, 0.0005510637731148198], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27830 loss: 2.3233 iter time (s): 64.094 samples/sec: 15.977 %comms: 0.0028432867481367812 %optimizer_step 0.057104175855188885 %forward: 22.6980453403124 %backward: 60.89558155928775 [2025-04-14 07:39:08,308] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 31139.06 | forward: 145480.86 | backward_microstep: 390315.39 | backward: 390304.15 | backward_inner_microstep: 390284.92 | backward_inner: 390277.78 | backward_allreduce_microstep: 9.28 | backward_allreduce: 3.17 | reduce_tied_grads: 0.32 | comms: 18.22 | reduce_grads: 0.41 | step: 366.00 | _step_clipping: 0.12 | _step_step: 363.97 | _step_zero_grad: 0.59 | _step_check_overflow: 0.71 samples/sec: 15.976 | iteration 27830/ 143000 | elapsed time per iteration (ms): 64094.6 | learning rate: 5.511E-04 | approx flops per GPU: 68.9TFLOPS | lm_loss: 2.318052E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 07:49:53,393] [INFO] [logging.py:60:log_dist] [Rank 0] step=27840, skipped=34, lr=[0.0005510276900904306, 0.0005510276900904306], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27840 loss: 2.3071 iter time (s): 64.508 samples/sec: 15.874 %comms: 0.0028263376652274998 %optimizer_step 0.05829981625408589 %forward: 22.614444368474395 %backward: 60.504998738387705 [2025-04-14 07:49:53,394] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 34867.55 | forward: 145881.00 | backward_microstep: 390317.18 | backward: 390304.96 | backward_inner_microstep: 390285.92 | backward_inner: 390278.86 | backward_allreduce_microstep: 8.99 | backward_allreduce: 3.10 | reduce_tied_grads: 0.36 | comms: 18.23 | reduce_grads: 0.24 | step: 376.08 | _step_clipping: 0.14 | _step_step: 373.59 | _step_zero_grad: 0.68 | _step_check_overflow: 0.79 samples/sec: 15.874 | iteration 27840/ 143000 | elapsed time per iteration (ms): 64508.6 | learning rate: 5.510E-04 | approx flops per GPU: 68.5TFLOPS | lm_loss: 2.312607E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 08:00:27,147] [INFO] [logging.py:60:log_dist] [Rank 0] step=27850, skipped=34, lr=[0.0005509915949503194, 0.0005509915949503194], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27850 loss: 2.3272 iter time (s): 63.375 samples/sec: 16.158 %comms: 0.0028680298307064696 %optimizer_step 0.05717403698442069 %forward: 22.966324725048914 %backward: 61.60059555491914 [2025-04-14 08:00:27,148] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23802.76 | forward: 145548.60 | backward_microstep: 390408.23 | backward: 390392.48 | backward_inner_microstep: 390373.15 | backward_inner: 390366.04 | backward_allreduce_microstep: 9.22 | backward_allreduce: 3.18 | reduce_tied_grads: 0.36 | comms: 18.18 | reduce_grads: 0.23 | step: 362.34 | _step_clipping: 0.14 | _step_step: 360.49 | _step_zero_grad: 0.55 | _step_check_overflow: 0.54 samples/sec: 16.158 | iteration 27850/ 143000 | elapsed time per iteration (ms): 63375.4 | learning rate: 5.510E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.330563E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 08:11:05,779] [INFO] [logging.py:60:log_dist] [Rank 0] step=27860, skipped=34, lr=[0.0005509554876962283, 0.0005509554876962283], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27860 loss: 2.3112 iter time (s): 63.863 samples/sec: 16.034 %comms: 0.00282936052810225 %optimizer_step 0.0563417830334221 %forward: 23.238278219860824 %backward: 61.110817986451735 [2025-04-14 08:11:05,780] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25945.41 | forward: 148405.66 | backward_microstep: 390280.41 | backward: 390269.51 | backward_inner_microstep: 390248.79 | backward_inner: 390241.75 | backward_allreduce_microstep: 9.14 | backward_allreduce: 3.14 | reduce_tied_grads: 0.35 | comms: 18.07 | reduce_grads: 0.23 | step: 359.81 | _step_clipping: 0.13 | _step_step: 358.01 | _step_zero_grad: 0.52 | _step_check_overflow: 0.52 samples/sec: 16.034 | iteration 27860/ 143000 | elapsed time per iteration (ms): 63863.2 | learning rate: 5.510E-04 | approx flops per GPU: 69.2TFLOPS | lm_loss: 2.322328E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 08:21:42,664] [INFO] [logging.py:60:log_dist] [Rank 0] step=27870, skipped=34, lr=[0.0005509193683299002, 0.0005509193683299002], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27870 loss: 2.3158 iter time (s): 63.688 samples/sec: 16.078 %comms: 0.0028566228191186124 %optimizer_step 0.058932052390139066 %forward: 22.85895185693073 %backward: 61.30629183724267 [2025-04-14 08:21:42,665] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26832.11 | forward: 145583.98 | backward_microstep: 390459.36 | backward: 390447.20 | backward_inner_microstep: 390428.57 | backward_inner: 390421.64 | backward_allreduce_microstep: 8.95 | backward_allreduce: 3.10 | reduce_tied_grads: 0.37 | comms: 18.19 | reduce_grads: 0.26 | step: 375.33 | _step_clipping: 0.15 | _step_step: 373.21 | _step_zero_grad: 0.56 | _step_check_overflow: 0.72 samples/sec: 16.078 | iteration 27870/ 143000 | elapsed time per iteration (ms): 63688.5 | learning rate: 5.509E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.314724E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 08:32:19,322] [INFO] [logging.py:60:log_dist] [Rank 0] step=27880, skipped=34, lr=[0.0005508832368530781, 0.0005508832368530781], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27880 loss: 2.3239 iter time (s): 63.665 samples/sec: 16.084 %comms: 0.0028552502563710946 %optimizer_step 0.059442936945076366 %forward: 22.862732873999697 %backward: 61.32176033241159 [2025-04-14 08:32:19,323] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26668.57 | forward: 145555.87 | backward_microstep: 390418.38 | backward: 390405.75 | backward_inner_microstep: 390385.89 | backward_inner: 390378.56 | backward_allreduce_microstep: 9.52 | backward_allreduce: 3.31 | reduce_tied_grads: 0.38 | comms: 18.18 | reduce_grads: 0.25 | step: 378.44 | _step_clipping: 0.14 | _step_step: 376.28 | _step_zero_grad: 0.66 | _step_check_overflow: 0.67 samples/sec: 16.084 | iteration 27880/ 143000 | elapsed time per iteration (ms): 63665.8 | learning rate: 5.509E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.310044E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 08:42:54,528] [INFO] [logging.py:60:log_dist] [Rank 0] step=27890, skipped=34, lr=[0.0005508470932675061, 0.0005508470932675061], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27890 loss: 2.3158 iter time (s): 63.520 samples/sec: 16.121 %comms: 0.002853856233082483 %optimizer_step 0.05806717063984357 %forward: 22.94674698826574 %backward: 61.46722356963758 [2025-04-14 08:42:54,529] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24984.11 | forward: 145757.62 | backward_microstep: 390454.02 | backward: 390439.48 | backward_inner_microstep: 390419.12 | backward_inner: 390411.89 | backward_allreduce_microstep: 9.97 | backward_allreduce: 3.57 | reduce_tied_grads: 0.36 | comms: 18.13 | reduce_grads: 0.23 | step: 368.84 | _step_clipping: 0.12 | _step_step: 366.93 | _step_zero_grad: 0.61 | _step_check_overflow: 0.53 samples/sec: 16.121 | iteration 27890/ 143000 | elapsed time per iteration (ms): 63520.6 | learning rate: 5.508E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.317382E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 08:53:29,812] [INFO] [logging.py:60:log_dist] [Rank 0] step=27900, skipped=34, lr=[0.0005508109375749285, 0.0005508109375749285], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27900 loss: 2.3049 iter time (s): 63.528 samples/sec: 16.119 %comms: 0.0028601483123771258 %optimizer_step 0.057823484449836746 %forward: 22.918533483268405 %backward: 61.45168489040257 [2025-04-14 08:53:29,813] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25292.19 | forward: 145596.30 | backward_microstep: 390400.28 | backward: 390388.75 | backward_inner_microstep: 390370.25 | backward_inner: 390363.39 | backward_allreduce_microstep: 8.92 | backward_allreduce: 3.07 | reduce_tied_grads: 0.37 | comms: 18.17 | reduce_grads: 0.24 | step: 367.34 | _step_clipping: 0.14 | _step_step: 365.43 | _step_zero_grad: 0.55 | _step_check_overflow: 0.56 samples/sec: 16.119 | iteration 27900/ 143000 | elapsed time per iteration (ms): 63528.4 | learning rate: 5.508E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.309298E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 09:04:04,426] [INFO] [logging.py:60:log_dist] [Rank 0] step=27910, skipped=34, lr=[0.0005507747697770903, 0.0005507747697770903], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27910 loss: 2.3457 iter time (s): 63.461 samples/sec: 16.136 %comms: 0.002836114509912017 %optimizer_step 0.0564707254511431 %forward: 22.91446190838402 %backward: 61.50088726221322 [2025-04-14 09:04:04,426] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24926.92 | forward: 145417.08 | backward_microstep: 390301.04 | backward: 390289.75 | backward_inner_microstep: 390270.90 | backward_inner: 390263.99 | backward_allreduce_microstep: 9.06 | backward_allreduce: 3.17 | reduce_tied_grads: 0.30 | comms: 18.00 | reduce_grads: 0.19 | step: 358.37 | _step_clipping: 0.11 | _step_step: 356.67 | _step_zero_grad: 0.53 | _step_check_overflow: 0.46 samples/sec: 16.136 | iteration 27910/ 143000 | elapsed time per iteration (ms): 63461.4 | learning rate: 5.508E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.309688E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 09:14:41,221] [INFO] [logging.py:60:log_dist] [Rank 0] step=27920, skipped=34, lr=[0.0005507385898757374, 0.0005507385898757374], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27920 loss: 2.3179 iter time (s): 63.679 samples/sec: 16.081 %comms: 0.002851710019061944 %optimizer_step 0.057233235258498195 %forward: 22.856247782130982 %backward: 61.319626126821156 [2025-04-14 09:14:41,222] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26756.46 | forward: 145546.18 | backward_microstep: 390490.56 | backward: 390476.93 | backward_inner_microstep: 390457.96 | backward_inner: 390450.78 | backward_allreduce_microstep: 9.11 | backward_allreduce: 3.11 | reduce_tied_grads: 0.33 | comms: 18.16 | reduce_grads: 0.23 | step: 364.46 | _step_clipping: 0.16 | _step_step: 362.46 | _step_zero_grad: 0.57 | _step_check_overflow: 0.62 samples/sec: 16.081 | iteration 27920/ 143000 | elapsed time per iteration (ms): 63679.6 | learning rate: 5.507E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.328860E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 09:25:25,859] [INFO] [logging.py:60:log_dist] [Rank 0] step=27930, skipped=34, lr=[0.0005507023978726156, 0.0005507023978726156], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27930 loss: 2.3279 iter time (s): 64.463 samples/sec: 15.885 %comms: 0.0028423209041652034 %optimizer_step 0.05727300662342952 %forward: 22.611093893381867 %backward: 60.58902839661513 [2025-04-14 09:25:25,859] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 34271.77 | forward: 145758.01 | backward_microstep: 390591.61 | backward: 390575.36 | backward_inner_microstep: 390554.88 | backward_inner: 390547.13 | backward_allreduce_microstep: 9.83 | backward_allreduce: 3.39 | reduce_tied_grads: 0.35 | comms: 18.32 | reduce_grads: 0.23 | step: 369.20 | _step_clipping: 0.14 | _step_step: 367.26 | _step_zero_grad: 0.63 | _step_check_overflow: 0.49 samples/sec: 15.885 | iteration 27930/ 143000 | elapsed time per iteration (ms): 64463.7 | learning rate: 5.507E-04 | approx flops per GPU: 68.5TFLOPS | lm_loss: 2.323735E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 09:36:09,503] [INFO] [logging.py:60:log_dist] [Rank 0] step=27940, skipped=34, lr=[0.0005506661937694719, 0.0005506661937694719], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27940 loss: 2.3195 iter time (s): 64.364 samples/sec: 15.910 %comms: 0.0028411830999065203 %optimizer_step 0.05718597294453509 %forward: 22.672362060403994 %backward: 60.71750149183042 [2025-04-14 09:36:09,503] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 32818.83 | forward: 145928.01 | backward_microstep: 390819.60 | backward: 390801.11 | backward_inner_microstep: 390779.98 | backward_inner: 390772.40 | backward_allreduce_microstep: 9.04 | backward_allreduce: 3.10 | reduce_tied_grads: 0.36 | comms: 18.29 | reduce_grads: 0.26 | step: 368.07 | _step_clipping: 0.13 | _step_step: 366.00 | _step_zero_grad: 0.57 | _step_check_overflow: 0.68 samples/sec: 15.909 | iteration 27940/ 143000 | elapsed time per iteration (ms): 64364.4 | learning rate: 5.507E-04 | approx flops per GPU: 68.6TFLOPS | lm_loss: 2.319606E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 09:46:49,816] [INFO] [logging.py:60:log_dist] [Rank 0] step=27950, skipped=34, lr=[0.0005506299775680538, 0.0005506299775680538], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27950 loss: 2.3162 iter time (s): 64.031 samples/sec: 15.992 %comms: 0.0028731675661075352 %optimizer_step 0.05625496249560317 %forward: 22.752748821328648 %backward: 60.99557027436586 [2025-04-14 09:46:49,817] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 30028.20 | forward: 145687.44 | backward_microstep: 390573.89 | backward: 390558.90 | backward_inner_microstep: 390540.74 | backward_inner: 390533.67 | backward_allreduce_microstep: 8.58 | backward_allreduce: 2.95 | reduce_tied_grads: 0.37 | comms: 18.40 | reduce_grads: 0.22 | step: 360.20 | _step_clipping: 0.13 | _step_step: 358.29 | _step_zero_grad: 0.54 | _step_check_overflow: 0.60 samples/sec: 15.992 | iteration 27950/ 143000 | elapsed time per iteration (ms): 64031.3 | learning rate: 5.506E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.323107E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 09:57:35,896] [INFO] [logging.py:60:log_dist] [Rank 0] step=27960, skipped=34, lr=[0.000550593749270109, 0.000550593749270109], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27960 loss: 2.3490 iter time (s): 64.607 samples/sec: 15.850 %comms: 0.002801173265556594 %optimizer_step 0.05762710905186762 %forward: 22.5712428806259 %backward: 60.46124425209262 [2025-04-14 09:57:35,896] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 35540.24 | forward: 145826.83 | backward_microstep: 390642.50 | backward: 390624.10 | backward_inner_microstep: 390603.85 | backward_inner: 390592.39 | backward_allreduce_microstep: 9.42 | backward_allreduce: 3.24 | reduce_tied_grads: 0.37 | comms: 18.10 | reduce_grads: 0.24 | step: 372.31 | _step_clipping: 0.13 | _step_step: 370.48 | _step_zero_grad: 0.57 | _step_check_overflow: 0.48 samples/sec: 15.849 | iteration 27960/ 143000 | elapsed time per iteration (ms): 64608.0 | learning rate: 5.506E-04 | approx flops per GPU: 68.4TFLOPS | lm_loss: 2.331901E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 10:08:15,727] [INFO] [logging.py:60:log_dist] [Rank 0] step=27970, skipped=34, lr=[0.0005505575088773863, 0.0005505575088773863], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27970 loss: 2.3111 iter time (s): 63.983 samples/sec: 16.004 %comms: 0.002857221383233774 %optimizer_step 0.05780674169450969 %forward: 22.818008301181123 %backward: 61.02809098981334 [2025-04-14 10:08:15,728] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29326.72 | forward: 145995.36 | backward_microstep: 390489.19 | backward: 390473.08 | backward_inner_microstep: 390451.67 | backward_inner: 390444.43 | backward_allreduce_microstep: 9.24 | backward_allreduce: 3.20 | reduce_tied_grads: 0.38 | comms: 18.28 | reduce_grads: 0.23 | step: 369.86 | _step_clipping: 0.14 | _step_step: 367.94 | _step_zero_grad: 0.57 | _step_check_overflow: 0.56 samples/sec: 16.004 | iteration 27970/ 143000 | elapsed time per iteration (ms): 63983.1 | learning rate: 5.506E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.322470E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 10:18:49,071] [INFO] [logging.py:60:log_dist] [Rank 0] step=27980, skipped=34, lr=[0.0005505212563916346, 0.0005505212563916346], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27980 loss: 2.3238 iter time (s): 63.334 samples/sec: 16.168 %comms: 0.002872032701975559 %optimizer_step 0.05713934527269623 %forward: 22.98849119807054 %backward: 61.63893842436009 [2025-04-14 10:18:49,071] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23352.39 | forward: 145594.80 | backward_microstep: 390396.87 | backward: 390382.69 | backward_inner_microstep: 390364.84 | backward_inner: 390357.96 | backward_allreduce_microstep: 8.62 | backward_allreduce: 3.00 | reduce_tied_grads: 0.31 | comms: 18.19 | reduce_grads: 0.21 | step: 361.89 | _step_clipping: 0.11 | _step_step: 360.16 | _step_zero_grad: 0.51 | _step_check_overflow: 0.51 samples/sec: 16.168 | iteration 27980/ 143000 | elapsed time per iteration (ms): 63334.4 | learning rate: 5.505E-04 | approx flops per GPU: 69.7TFLOPS | lm_loss: 2.316448E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 10:29:21,012] [INFO] [logging.py:60:log_dist] [Rank 0] step=27990, skipped=34, lr=[0.0005504849918146037, 0.0005504849918146037], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 27990 loss: 2.3133 iter time (s): 63.194 samples/sec: 16.204 %comms: 0.0032097749731560513 %optimizer_step 0.05696109315461249 %forward: 23.044249711942165 %backward: 61.79871620515759 [2025-04-14 10:29:21,013] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21731.86 | forward: 145624.73 | backward_microstep: 390543.07 | backward: 390527.86 | backward_inner_microstep: 390510.26 | backward_inner: 390501.75 | backward_allreduce_microstep: 8.24 | backward_allreduce: 2.84 | reduce_tied_grads: 2.25 | comms: 20.28 | reduce_grads: 0.21 | step: 359.96 | _step_clipping: 0.12 | _step_step: 358.02 | _step_zero_grad: 0.56 | _step_check_overflow: 0.62 samples/sec: 16.204 | iteration 27990/ 143000 | elapsed time per iteration (ms): 63194.1 | learning rate: 5.505E-04 | approx flops per GPU: 69.9TFLOPS | lm_loss: 2.310219E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 10:39:51,795] [INFO] [logging.py:60:log_dist] [Rank 0] step=28000, skipped=34, lr=[0.0005504487151480438, 0.0005504487151480438], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 28000 loss: 2.3171 iter time (s): 63.078 samples/sec: 16.234 %comms: 0.0028750374359817433 %optimizer_step 0.057074240484101396 %forward: 23.08854280958372 %backward: 61.897250985013095 [2025-04-14 10:39:51,795] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20691.82 | forward: 145637.18 | backward_microstep: 390445.55 | backward: 390433.53 | backward_inner_microstep: 390415.71 | backward_inner: 390408.90 | backward_allreduce_microstep: 8.50 | backward_allreduce: 2.88 | reduce_tied_grads: 0.34 | comms: 18.14 | reduce_grads: 0.22 | step: 360.01 | _step_clipping: 0.13 | _step_step: 358.18 | _step_zero_grad: 0.51 | _step_check_overflow: 0.56 samples/sec: 16.234 | iteration 28000/ 143000 | elapsed time per iteration (ms): 63078.3 | learning rate: 5.504E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.313996E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 10:39:54,743] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: ../checkpoints/mydata_left-pythia160m/global_step28000/mp_rank_00_model_states.pt [2025-04-14 10:40:08,559] [INFO] [engine.py:1805:_copy_recovery_script] creating recovery script ../checkpoints/mydata_left-pythia160m/zero_to_fp32.py [2025-04-14 10:40:08,565] [INFO] [engine.py:1818:_save_zero_checkpoint] zero checkpoint saved ../checkpoints/mydata_left-pythia160m/global_step28000/zero_pp_rank_0_mp_rank_00_optim_states.pt [2025-04-14 10:50:42,305] [INFO] [logging.py:60:log_dist] [Rank 0] step=28010, skipped=34, lr=[0.000550412426393706, 0.000550412426393706], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 28010 loss: 2.3090 iter time (s): 63.373 samples/sec: 16.158 %comms: 0.00289698704506204 %optimizer_step 0.05695469081486937 %forward: 23.011866691433312 %backward: 61.64246569695597 [2025-04-14 10:50:42,305] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23196.79 | forward: 145832.07 | backward_microstep: 390663.37 | backward: 390644.03 | backward_inner_microstep: 390623.90 | backward_inner: 390616.69 | backward_allreduce_microstep: 8.55 | backward_allreduce: 2.95 | reduce_tied_grads: 0.35 | comms: 18.36 | reduce_grads: 0.25 | step: 360.94 | _step_clipping: 0.12 | _step_step: 359.12 | _step_zero_grad: 0.51 | _step_check_overflow: 0.54 samples/sec: 15.741 | iteration 28010/ 143000 | elapsed time per iteration (ms): 65051.0 | learning rate: 5.504E-04 | approx flops per GPU: 67.9TFLOPS | lm_loss: 2.304515E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 11:01:15,140] [INFO] [logging.py:60:log_dist] [Rank 0] step=28020, skipped=34, lr=[0.0005503761255533416, 0.0005503761255533416], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 28020 loss: 2.3112 iter time (s): 63.283 samples/sec: 16.181 %comms: 0.0028933293203146637 %optimizer_step 0.056689580777219875 %forward: 23.013920650640245 %backward: 61.701135530129825 [2025-04-14 11:01:15,141] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22707.40 | forward: 145638.80 | backward_microstep: 390478.89 | backward: 390462.78 | backward_inner_microstep: 390445.18 | backward_inner: 390438.30 | backward_allreduce_microstep: 8.18 | backward_allreduce: 2.80 | reduce_tied_grads: 0.31 | comms: 18.31 | reduce_grads: 0.20 | step: 358.75 | _step_clipping: 0.12 | _step_step: 356.79 | _step_zero_grad: 0.56 | _step_check_overflow: 0.63 samples/sec: 16.181 | iteration 28020/ 143000 | elapsed time per iteration (ms): 63283.5 | learning rate: 5.504E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.304157E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 11:11:50,549] [INFO] [logging.py:60:log_dist] [Rank 0] step=28030, skipped=34, lr=[0.0005503398126287027, 0.0005503398126287027], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 28030 loss: 2.3052 iter time (s): 63.540 samples/sec: 16.116 %comms: 0.002829565881368516 %optimizer_step 0.057580183550160216 %forward: 22.937508998111433 %backward: 61.43071383560871 [2025-04-14 11:11:50,549] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25331.36 | forward: 145745.61 | backward_microstep: 390346.28 | backward: 390332.57 | backward_inner_microstep: 390314.27 | backward_inner: 390307.06 | backward_allreduce_microstep: 8.58 | backward_allreduce: 2.96 | reduce_tied_grads: 0.31 | comms: 17.98 | reduce_grads: 0.25 | step: 365.87 | _step_clipping: 0.12 | _step_step: 364.13 | _step_zero_grad: 0.51 | _step_check_overflow: 0.52 samples/sec: 16.116 | iteration 28030/ 143000 | elapsed time per iteration (ms): 63540.9 | learning rate: 5.503E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.308625E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 11:22:28,005] [INFO] [logging.py:60:log_dist] [Rank 0] step=28040, skipped=34, lr=[0.0005503034876215417, 0.0005503034876215417], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 28040 loss: 2.3154 iter time (s): 63.745 samples/sec: 16.064 %comms: 0.0028541775296710447 %optimizer_step 0.05581439141434674 %forward: 22.84470420809098 %backward: 61.23512062387043 [2025-04-14 11:22:28,006] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27500.07 | forward: 145623.61 | backward_microstep: 390359.56 | backward: 390343.39 | backward_inner_microstep: 390326.22 | backward_inner: 390319.35 | backward_allreduce_microstep: 7.92 | backward_allreduce: 2.72 | reduce_tied_grads: 0.30 | comms: 18.19 | reduce_grads: 0.19 | step: 355.79 | _step_clipping: 0.12 | _step_step: 354.03 | _step_zero_grad: 0.53 | _step_check_overflow: 0.51 samples/sec: 16.064 | iteration 28040/ 143000 | elapsed time per iteration (ms): 63745.6 | learning rate: 5.503E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.306420E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 11:33:01,238] [INFO] [logging.py:60:log_dist] [Rank 0] step=28050, skipped=34, lr=[0.0005502671505336122, 0.0005502671505336122], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 28050 loss: 2.3134 iter time (s): 63.323 samples/sec: 16.171 %comms: 0.0028728349732930285 %optimizer_step 0.05571724516079033 %forward: 22.967725426706835 %backward: 61.62563968522432 [2025-04-14 11:33:01,239] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23591.33 | forward: 145437.91 | backward_microstep: 390243.23 | backward: 390230.39 | backward_inner_microstep: 390210.56 | backward_inner: 390204.12 | backward_allreduce_microstep: 7.70 | backward_allreduce: 2.66 | reduce_tied_grads: 0.31 | comms: 18.19 | reduce_grads: 0.19 | step: 352.82 | _step_clipping: 0.12 | _step_step: 351.05 | _step_zero_grad: 0.48 | _step_check_overflow: 0.55 samples/sec: 16.171 | iteration 28050/ 143000 | elapsed time per iteration (ms): 63323.3 | learning rate: 5.503E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.318457E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 11:43:32,172] [INFO] [logging.py:60:log_dist] [Rank 0] step=28060, skipped=34, lr=[0.0005502308013666675, 0.0005502308013666675], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 28060 loss: 2.3505 iter time (s): 63.093 samples/sec: 16.230 %comms: 0.0029494696293996133 %optimizer_step 0.05652864189874826 %forward: 23.04877900340726 %backward: 61.85341368038979 [2025-04-14 11:43:32,173] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 21281.03 | forward: 145421.33 | backward_microstep: 390264.02 | backward: 390250.87 | backward_inner_microstep: 390233.35 | backward_inner: 390226.67 | backward_allreduce_microstep: 8.39 | backward_allreduce: 2.85 | reduce_tied_grads: 0.34 | comms: 18.61 | reduce_grads: 0.22 | step: 356.66 | _step_clipping: 0.12 | _step_step: 354.47 | _step_zero_grad: 0.56 | _step_check_overflow: 0.86 samples/sec: 16.230 | iteration 28060/ 143000 | elapsed time per iteration (ms): 63093.5 | learning rate: 5.502E-04 | approx flops per GPU: 70.0TFLOPS | lm_loss: 2.323395E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 11:54:18,760] [INFO] [logging.py:60:log_dist] [Rank 0] step=28070, skipped=34, lr=[0.0005501944401224625, 0.0005501944401224625], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 28070 loss: 2.3186 iter time (s): 64.658 samples/sec: 15.837 %comms: 0.0028108853359516206 %optimizer_step 0.057512601909149336 %forward: 22.49900282524471 %backward: 60.336501795388756 [2025-04-14 11:54:18,761] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 36988.85 | forward: 145474.26 | backward_microstep: 390136.14 | backward: 390124.31 | backward_inner_microstep: 390102.23 | backward_inner: 390095.10 | backward_allreduce_microstep: 11.64 | backward_allreduce: 3.44 | reduce_tied_grads: 0.38 | comms: 18.17 | reduce_grads: 0.26 | step: 371.87 | _step_clipping: 0.14 | _step_step: 369.77 | _step_zero_grad: 0.62 | _step_check_overflow: 0.65 samples/sec: 15.837 | iteration 28070/ 143000 | elapsed time per iteration (ms): 64658.8 | learning rate: 5.502E-04 | approx flops per GPU: 68.3TFLOPS | lm_loss: 2.319751E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 12:04:54,782] [INFO] [logging.py:60:log_dist] [Rank 0] step=28080, skipped=34, lr=[0.0005501580668027519, 0.0005501580668027519], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 28080 loss: 2.3097 iter time (s): 63.602 samples/sec: 16.100 %comms: 0.002917670658464947 %optimizer_step 0.05770343833154215 %forward: 22.86015004828259 %backward: 61.33071496123683 [2025-04-14 12:04:54,783] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26611.87 | forward: 145394.06 | backward_microstep: 390083.03 | backward: 390072.76 | backward_inner_microstep: 390053.67 | backward_inner: 390046.77 | backward_allreduce_microstep: 9.37 | backward_allreduce: 3.21 | reduce_tied_grads: 0.36 | comms: 18.56 | reduce_grads: 0.25 | step: 367.00 | _step_clipping: 0.17 | _step_step: 364.91 | _step_zero_grad: 0.62 | _step_check_overflow: 0.60 samples/sec: 16.100 | iteration 28080/ 143000 | elapsed time per iteration (ms): 63602.2 | learning rate: 5.502E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.311747E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 12:15:27,763] [INFO] [logging.py:60:log_dist] [Rank 0] step=28090, skipped=34, lr=[0.0005501216814092913, 0.0005501216814092913], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 28090 loss: 2.3099 iter time (s): 63.298 samples/sec: 16.178 %comms: 0.0028798536568937277 %optimizer_step 0.05656079740369115 %forward: 22.995414793636666 %backward: 61.660551958702406 [2025-04-14 12:15:27,764] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 23151.55 | forward: 145555.35 | backward_microstep: 390306.37 | backward: 390296.21 | backward_inner_microstep: 390279.19 | backward_inner: 390272.58 | backward_allreduce_microstep: 8.19 | backward_allreduce: 2.82 | reduce_tied_grads: 0.30 | comms: 18.23 | reduce_grads: 0.20 | step: 358.02 | _step_clipping: 0.12 | _step_step: 356.19 | _step_zero_grad: 0.53 | _step_check_overflow: 0.57 samples/sec: 16.177 | iteration 28090/ 143000 | elapsed time per iteration (ms): 63298.1 | learning rate: 5.501E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.311167E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 12:26:04,247] [INFO] [logging.py:60:log_dist] [Rank 0] step=28100, skipped=34, lr=[0.0005500852839438366, 0.0005500852839438366], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 28100 loss: 2.3047 iter time (s): 63.648 samples/sec: 16.089 %comms: 0.002898471194125945 %optimizer_step 0.058585138772187 %forward: 22.879869030232094 %backward: 61.35183364449167 [2025-04-14 12:26:04,248] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 26341.34 | forward: 145625.20 | backward_microstep: 390504.64 | backward: 390490.56 | backward_inner_microstep: 390471.25 | backward_inner: 390464.03 | backward_allreduce_microstep: 9.16 | backward_allreduce: 3.15 | reduce_tied_grads: 0.36 | comms: 18.45 | reduce_grads: 0.25 | step: 372.88 | _step_clipping: 0.12 | _step_step: 370.97 | _step_zero_grad: 0.52 | _step_check_overflow: 0.60 samples/sec: 16.088 | iteration 28100/ 143000 | elapsed time per iteration (ms): 63648.4 | learning rate: 5.501E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.304888E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 12:36:34,669] [INFO] [logging.py:60:log_dist] [Rank 0] step=28110, skipped=34, lr=[0.0005500488744081448, 0.0005500488744081448], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 28110 loss: 2.3080 iter time (s): 63.042 samples/sec: 16.243 %comms: 0.0029217247184531057 %optimizer_step 0.057660901991663134 %forward: 23.079340653832517 %backward: 61.93180713946216 [2025-04-14 12:36:34,670] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 20494.63 | forward: 145495.91 | backward_microstep: 390440.63 | backward: 390428.17 | backward_inner_microstep: 390409.65 | backward_inner: 390402.64 | backward_allreduce_microstep: 8.80 | backward_allreduce: 3.01 | reduce_tied_grads: 0.38 | comms: 18.42 | reduce_grads: 0.24 | step: 363.50 | _step_clipping: 0.16 | _step_step: 361.57 | _step_zero_grad: 0.55 | _step_check_overflow: 0.58 samples/sec: 16.243 | iteration 28110/ 143000 | elapsed time per iteration (ms): 63042.2 | learning rate: 5.500E-04 | approx flops per GPU: 70.1TFLOPS | lm_loss: 2.307347E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 12:47:07,233] [INFO] [logging.py:60:log_dist] [Rank 0] step=28120, skipped=34, lr=[0.000550012452803973, 0.000550012452803973], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 28120 loss: 2.3187 iter time (s): 63.256 samples/sec: 16.188 %comms: 0.002881755155477198 %optimizer_step 0.056651664634104024 %forward: 23.02408485301954 %backward: 61.71546133431319 [2025-04-14 12:47:07,234] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 22538.26 | forward: 145640.66 | backward_microstep: 390398.22 | backward: 390386.01 | backward_inner_microstep: 390367.43 | backward_inner: 390360.33 | backward_allreduce_microstep: 8.97 | backward_allreduce: 3.05 | reduce_tied_grads: 0.34 | comms: 18.23 | reduce_grads: 0.23 | step: 358.35 | _step_clipping: 0.13 | _step_step: 356.47 | _step_zero_grad: 0.53 | _step_check_overflow: 0.60 samples/sec: 16.188 | iteration 28120/ 143000 | elapsed time per iteration (ms): 63256.4 | learning rate: 5.500E-04 | approx flops per GPU: 69.8TFLOPS | lm_loss: 2.316447E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 12:57:53,482] [INFO] [logging.py:60:log_dist] [Rank 0] step=28130, skipped=34, lr=[0.0005499760191330792, 0.0005499760191330792], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 28130 loss: 2.3130 iter time (s): 64.624 samples/sec: 15.845 %comms: 0.002852606676681892 %optimizer_step 0.05945040243373462 %forward: 22.56948044103394 %backward: 60.483060512764794 [2025-04-14 12:57:53,483] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 35427.14 | forward: 145853.62 | backward_microstep: 390886.01 | backward: 390867.37 | backward_inner_microstep: 390847.05 | backward_inner: 390839.22 | backward_allreduce_microstep: 9.47 | backward_allreduce: 3.21 | reduce_tied_grads: 0.36 | comms: 18.43 | reduce_grads: 0.24 | step: 384.19 | _step_clipping: 0.13 | _step_step: 382.25 | _step_zero_grad: 0.61 | _step_check_overflow: 0.53 samples/sec: 15.845 | iteration 28130/ 143000 | elapsed time per iteration (ms): 64624.9 | learning rate: 5.500E-04 | approx flops per GPU: 68.4TFLOPS | lm_loss: 2.318086E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 13:08:51,327] [INFO] [logging.py:60:log_dist] [Rank 0] step=28140, skipped=34, lr=[0.0005499395733972217, 0.0005499395733972217], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 28140 loss: 2.3024 iter time (s): 65.783 samples/sec: 15.566 %comms: 0.0027592938249887695 %optimizer_step 0.055676390240563194 %forward: 22.183629877980746 %backward: 59.36021580975106 [2025-04-14 13:08:51,328] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 47401.77 | forward: 145931.05 | backward_microstep: 390504.84 | backward: 390490.58 | backward_inner_microstep: 390469.01 | backward_inner: 390461.61 | backward_allreduce_microstep: 9.27 | backward_allreduce: 3.22 | reduce_tied_grads: 0.34 | comms: 18.15 | reduce_grads: 0.21 | step: 366.26 | _step_clipping: 0.15 | _step_step: 364.38 | _step_zero_grad: 0.57 | _step_check_overflow: 0.52 samples/sec: 15.566 | iteration 28140/ 143000 | elapsed time per iteration (ms): 65784.5 | learning rate: 5.499E-04 | approx flops per GPU: 67.2TFLOPS | lm_loss: 2.307866E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 13:19:26,460] [INFO] [logging.py:60:log_dist] [Rank 0] step=28150, skipped=34, lr=[0.0005499031155981598, 0.0005499031155981598], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 28150 loss: 2.3149 iter time (s): 63.513 samples/sec: 16.123 %comms: 0.002883353505252183 %optimizer_step 0.057090962485656795 %forward: 22.933601456404567 %backward: 61.498425233828456 [2025-04-14 13:19:26,461] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24857.99 | forward: 145657.29 | backward_microstep: 390609.30 | backward: 390592.56 | backward_inner_microstep: 390573.83 | backward_inner: 390566.49 | backward_allreduce_microstep: 8.81 | backward_allreduce: 3.04 | reduce_tied_grads: 0.32 | comms: 18.31 | reduce_grads: 0.23 | step: 362.60 | _step_clipping: 0.13 | _step_step: 360.74 | _step_zero_grad: 0.55 | _step_check_overflow: 0.52 samples/sec: 16.123 | iteration 28150/ 143000 | elapsed time per iteration (ms): 63513.3 | learning rate: 5.499E-04 | approx flops per GPU: 69.6TFLOPS | lm_loss: 2.312246E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 13:30:06,216] [INFO] [logging.py:60:log_dist] [Rank 0] step=28160, skipped=34, lr=[0.0005498666457376528, 0.0005498666457376528], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 28160 loss: 2.3158 iter time (s): 63.975 samples/sec: 16.006 %comms: 0.002904926953780422 %optimizer_step 0.056935234117658835 %forward: 22.776006140538854 %backward: 61.067815110538874 [2025-04-14 13:30:06,216] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29327.78 | forward: 145709.35 | backward_microstep: 390697.10 | backward: 390680.95 | backward_inner_microstep: 390662.08 | backward_inner: 390654.72 | backward_allreduce_microstep: 8.89 | backward_allreduce: 3.06 | reduce_tied_grads: 0.37 | comms: 18.58 | reduce_grads: 0.25 | step: 364.24 | _step_clipping: 0.11 | _step_step: 362.07 | _step_zero_grad: 0.63 | _step_check_overflow: 0.58 samples/sec: 16.006 | iteration 28160/ 143000 | elapsed time per iteration (ms): 63975.6 | learning rate: 5.499E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.310975E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 13:40:41,711] [INFO] [logging.py:60:log_dist] [Rank 0] step=28170, skipped=34, lr=[0.000549830163817461, 0.000549830163817461], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 28170 loss: 2.3316 iter time (s): 63.549 samples/sec: 16.114 %comms: 0.002899001330564738 %optimizer_step 0.056311668622283574 %forward: 22.968694536612198 %backward: 61.49842493221972 [2025-04-14 13:40:41,712] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 24626.25 | forward: 145963.59 | backward_microstep: 390834.69 | backward: 390815.88 | backward_inner_microstep: 390797.00 | backward_inner: 390787.66 | backward_allreduce_microstep: 8.75 | backward_allreduce: 3.01 | reduce_tied_grads: 0.33 | comms: 18.42 | reduce_grads: 0.22 | step: 357.85 | _step_clipping: 0.15 | _step_step: 355.83 | _step_zero_grad: 0.59 | _step_check_overflow: 0.63 samples/sec: 16.113 | iteration 28170/ 143000 | elapsed time per iteration (ms): 63549.6 | learning rate: 5.498E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.316314E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 13:51:18,362] [INFO] [logging.py:60:log_dist] [Rank 0] step=28180, skipped=34, lr=[0.0005497936698393453, 0.0005497936698393453], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 28180 loss: 2.3113 iter time (s): 63.664 samples/sec: 16.084 %comms: 0.002858279252495847 %optimizer_step 0.05663305621271644 %forward: 22.900426340939294 %backward: 61.39841073501919 [2025-04-14 13:51:18,362] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25880.17 | forward: 145794.16 | backward_microstep: 390912.52 | backward: 390889.22 | backward_inner_microstep: 390866.66 | backward_inner: 390858.62 | backward_allreduce_microstep: 8.96 | backward_allreduce: 3.08 | reduce_tied_grads: 0.32 | comms: 18.20 | reduce_grads: 0.20 | step: 360.55 | _step_clipping: 0.13 | _step_step: 358.63 | _step_zero_grad: 0.53 | _step_check_overflow: 0.60 samples/sec: 16.084 | iteration 28180/ 143000 | elapsed time per iteration (ms): 63665.0 | learning rate: 5.498E-04 | approx flops per GPU: 69.4TFLOPS | lm_loss: 2.320669E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 14:01:58,211] [INFO] [logging.py:60:log_dist] [Rank 0] step=28190, skipped=34, lr=[0.000549757163805067, 0.000549757163805067], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 28190 loss: 2.3103 iter time (s): 63.984 samples/sec: 16.004 %comms: 0.002930584455745944 %optimizer_step 0.05757386694192737 %forward: 22.813327564042662 %backward: 61.05591642297461 [2025-04-14 14:01:58,211] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 29138.01 | forward: 145969.52 | backward_microstep: 390682.61 | backward: 390662.11 | backward_inner_microstep: 390641.97 | backward_inner: 390634.09 | backward_allreduce_microstep: 9.54 | backward_allreduce: 3.25 | reduce_tied_grads: 0.39 | comms: 18.75 | reduce_grads: 0.27 | step: 368.38 | _step_clipping: 0.15 | _step_step: 366.30 | _step_zero_grad: 0.59 | _step_check_overflow: 0.63 samples/sec: 16.004 | iteration 28190/ 143000 | elapsed time per iteration (ms): 63984.9 | learning rate: 5.498E-04 | approx flops per GPU: 69.0TFLOPS | lm_loss: 2.308169E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 14:12:35,763] [INFO] [logging.py:60:log_dist] [Rank 0] step=28200, skipped=34, lr=[0.000549720645716388, 0.000549720645716388], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 28200 loss: 2.3172 iter time (s): 63.755 samples/sec: 16.062 %comms: 0.0028372938779303762 %optimizer_step 0.058154109630956956 %forward: 22.848435496556313 %backward: 61.2380950132545 [2025-04-14 14:12:35,764] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 27426.39 | forward: 145669.31 | backward_microstep: 390433.18 | backward: 390421.08 | backward_inner_microstep: 390398.78 | backward_inner: 390391.76 | backward_allreduce_microstep: 9.00 | backward_allreduce: 3.07 | reduce_tied_grads: 0.34 | comms: 18.09 | reduce_grads: 0.20 | step: 370.76 | _step_clipping: 0.13 | _step_step: 368.92 | _step_zero_grad: 0.55 | _step_check_overflow: 0.51 samples/sec: 16.061 | iteration 28200/ 143000 | elapsed time per iteration (ms): 63755.2 | learning rate: 5.497E-04 | approx flops per GPU: 69.3TFLOPS | lm_loss: 2.313140E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) [2025-04-14 14:23:11,161] [INFO] [logging.py:60:log_dist] [Rank 0] step=28210, skipped=34, lr=[0.0005496841155750708, 0.0005496841155750708], mom=[[0.9, 0.95], [0.9, 0.95]] steps: 28210 loss: 2.3121 iter time (s): 63.539 samples/sec: 16.116 %comms: 0.0028849238816357247 %optimizer_step 0.05727915730108139 %forward: 22.917573854947232 %backward: 61.45928247592639 [2025-04-14 14:23:11,161] [INFO] [logging.py:60:log_dist] [Rank 0] rank=0 time (ms) | train_batch: 0.00 | batch_input: 25211.06 | forward: 145616.42 | backward_microstep: 390522.90 | backward: 390507.33 | backward_inner_microstep: 390487.80 | backward_inner: 390480.37 | backward_allreduce_microstep: 9.36 | backward_allreduce: 3.14 | reduce_tied_grads: 0.35 | comms: 18.33 | reduce_grads: 0.22 | step: 363.95 | _step_clipping: 0.13 | _step_step: 362.09 | _step_zero_grad: 0.50 | _step_check_overflow: 0.58 samples/sec: 16.116 | iteration 28210/ 143000 | elapsed time per iteration (ms): 63539.8 | learning rate: 5.497E-04 | approx flops per GPU: 69.5TFLOPS | lm_loss: 2.311558E+00 | loss scale: 131072.0 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms)